{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "41c0e75c",
   "metadata": {},
   "outputs": [],
   "source": [
    "###### PART 1 - PREPARING THE DATA ######\n",
    "\n",
    "## We load the data and preprocess it to prepare for the analysis.\n",
    "## We also compute descriptive statistics for the data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f301b20f",
   "metadata": {},
   "outputs": [],
   "source": [
    "### ALL IMPORTS\n",
    "\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "import re\n",
    "import seaborn as sns\n",
    "from scipy import stats\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ada138c",
   "metadata": {},
   "outputs": [],
   "source": [
    "### directories\n",
    "\n",
    "DIR = \"/Replication_SocietalAI\" # insert your directory path here\n",
    "DATA_DIR = os.path.join(DIR, \"Data\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7beab149",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(103314, 20)\n"
     ]
    }
   ],
   "source": [
    "##### Load the main file\n",
    "# Build the path first so the file being read is easy to spot and change.\n",
    "main_csv_path = os.path.join(DATA_DIR, \"June2025_final_data.csv\")\n",
    "df = pd.read_csv(main_csv_path)\n",
    "# Quick sanity check: (rows, columns) of the raw table.\n",
    "print(df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "9de948ef",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data shape after dropping duplicates: (103314, 20)\n",
      "Data shape after filtering short papers: (101919, 20)\n"
     ]
    }
   ],
   "source": [
    "# Keep one row per paper; paper_id is the de-duplication key\n",
    "df = df.drop_duplicates(subset=['paper_id'])\n",
    "print(f\"Data shape after dropping duplicates: {df.shape}\")\n",
    "\n",
    "# Discard very short papers, i.e. keep only those with more than 4 sentences\n",
    "has_enough_sentences = df['total_sentences'] > 4\n",
    "# .copy() detaches the filtered frame so later column edits do not act on a view\n",
    "df = df.loc[has_enough_sentences].copy()\n",
    "print(f\"Data shape after filtering short papers: {df.shape}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "88304c56",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Descriptive Statistics for Societal Engagement (percent_societal):\n",
      "          Statistic          Value\n",
      "0             Count  101919.000000\n",
      "1              Mean       9.381599\n",
      "2           Std Dev      16.308747\n",
      "3               Min       0.000000\n",
      "4               Max     100.000000\n",
      "5            Median       1.980198\n",
      "6          Variance     265.975244\n",
      "7          Skewness       2.517782\n",
      "8          Kurtosis       6.682281\n",
      "9    5th Percentile       0.000000\n",
      "10  10th Percentile       0.000000\n",
      "11  25th Percentile       0.000000\n",
      "12  50th Percentile       1.980198\n",
      "13  75th Percentile      11.111111\n",
      "14  90th Percentile      30.303030\n",
      "15  95th Percentile      47.169811\n"
     ]
    }
   ],
   "source": [
    "####################################\n",
    "# Descriptive Statistics for Outcome\n",
    "####################################\n",
    "\n",
    "# Select the outcome column once and reuse it for every statistic below.\n",
    "_outcome = df['percent_societal']\n",
    "\n",
    "stats_societal = _outcome.describe()\n",
    "median_societal = _outcome.median()\n",
    "variance_societal = _outcome.var()\n",
    "skewness_societal = _outcome.skew()\n",
    "# NOTE: pandas' kurt() uses Fisher's definition (a normal distribution scores 0).\n",
    "kurtosis_societal = _outcome.kurt()\n",
    "\n",
    "# np.percentile does not skip NaN, so missing values are dropped first.\n",
    "percentiles = [5, 10, 25, 50, 75, 90, 95]\n",
    "percent_values = np.percentile(_outcome.dropna(), percentiles)\n",
    "\n",
    "# Labels and values are assembled as two parallel lists, in the same order.\n",
    "_stat_labels = ['Count', 'Mean', 'Std Dev', 'Min', 'Max', 'Median', 'Variance', 'Skewness', 'Kurtosis']\n",
    "_stat_labels += [f'{p}th Percentile' for p in percentiles]\n",
    "\n",
    "_stat_values = [stats_societal[key] for key in ('count', 'mean', 'std', 'min', 'max')]\n",
    "_stat_values += [median_societal, variance_societal, skewness_societal, kurtosis_societal]\n",
    "_stat_values += percent_values.tolist()\n",
    "\n",
    "stats_df = pd.DataFrame({'Statistic': _stat_labels, 'Value': _stat_values})\n",
    "print(\"Descriptive Statistics for Societal Engagement (percent_societal):\")\n",
    "print(stats_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "35ad663e",
   "metadata": {},
   "outputs": [],
   "source": [
    "### Rename columns to the names used from here on:\n",
    "###   \"main_FOS\"        -> \"authors_final_FOS\"\n",
    "###   \"question\"        -> \"RQ\"\n",
    "###   \"predicted_label\" -> \"RQ_societal\"\n",
    "column_renames = {\n",
    "    'main_FOS': 'authors_final_FOS',\n",
    "    'question': 'RQ',\n",
    "    'predicted_label': 'RQ_societal',\n",
    "}\n",
    "# Reassign instead of using inplace=True. Re-running the cell is harmless because\n",
    "# rename() ignores labels that are no longer present (errors='ignore' is the default).\n",
    "df = df.rename(columns=column_renames)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "260c9677",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.microsoft.datawrangler.viewer.v0+json": {
       "columns": [
        {
         "name": "index",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "abstract",
         "rawType": "object",
         "type": "unknown"
        },
        {
         "name": "author_ids",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "citation_count",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "conclusion",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "introduction",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "authors_final_FOS",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "paper_id",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "percent_societal",
         "rawType": "float64",
         "type": "float"
        },
        {
         "name": "RQ_societal",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "RQ",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "subdomain",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "title",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "venue",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "year",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "team_size",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "event_type",
         "rawType": "object",
         "type": "string"
        },
        {
         "name": "total_sentences",
         "rawType": "int64",
         "type": "integer"
        },
        {
         "name": "article_length",
         "rawType": "float64",
         "type": "float"
        },
        {
         "name": "categories",
         "rawType": "object",
         "type": "string"
        }
       ],
       "ref": "19f34f29-ff47-4b66-8af8-d57bcea9f266",
       "rows": [
        [
         "0",
         null,
         "145744305, 144494734, 2226362, 1397958221",
         "77",
         "Discussion and Conclusion We provided an overview of several clinical NLP systems under a unified architectural view. Background knowledge plays a crucial role in any clinical NLP task, and currently the UMLS is a major background knowledge component of most systems. Rule-based approaches utilizing the UMLS are still dominant in many clinical NLP systems. Rule-based NLP systems have historically achieved very good performance within specific domains and document types such as radiology reports and discharge summaries. One of the main reasons for using a rule-based approach is that rules are relatively easy to customize and adapt to new domains as well as to different types of clinical text. Earlier NLP systems such as LSP-MLP and MedLEE are comprised of “hard coded” system modules that do not facilitate reuse. The development of general frameworks such as GATE and UIMA allows sub-tasks or modules to be developed independently and integrated easily into the framework. Machine learning algorithms have been shown to significantly benefit NLP sub-tasks such as NER. Therefore, they can serve as independent modules to be integrated into a framework to improve a sub-task in a clinical NLP system. The combination of machine learning and rule-based approaches in a single hybrid NLP system often achieves better performance than systems based on a single approach. In recent years, a clear trend has developed towards creating reusable NLP modules within open source frameworks like GATE and UIMA. The main limitation of machine learning when compared to rule-based approaches is that rule-based systems do not require significant amounts of expensive, manually annotated training data, machine learning algorithms typically do. This problem is exacerbated in the biomedical domain, where suitably qualified annotators can be both hard to find and prohibitively expensive . There is an increasing trend towards building community-wide resources and tools for clinical NLP. 
There have been several shared tasks that bring researchers in clinical NLP together to solve, evaluate and compare different methods. Additionally, there are shared computing resources that aggregate several NLP tools to facilitate the work of researchers, such as the NLP environment in iDASH . The Online Registry of Biomedical Informatics Tools project is another platform for sharing and collaborating for biomedical researchers in order to create and maintain a software registry, in addition to knowledge bases and data sets. Applications that benefit from biomedical NLP systems, such as EMR linking to genomic information , are likely to have great utilization in the next few years. We presented here a unified overview of a few exemplary NLP systems from the architectural perspective that all these systems have two important components: background knowledge and a computational framework. How these components are constructed and integrated into pipelines for biomedical NLP is a critical determinant for their performance. Acknowledgements SD and LOM were funded in part by NIH grants U54HL108460 and UH3HL108785. References . Sager N, Friedman C, Lyman M Medical language processing: computer management of narrative data. Addison-Wesley, Reading, MA . Lindberg DA, Humphreys BL, McCray AT The Unified Medical Language System. Methods Inf Med :– . Spyns P Natural language processing in medicine: an overview. Methods Inf Med :– . Demner-Fushman D, Chapman WW, McDonald CJ What can natural language processing do for clinical decision support? J Biomed Inform :– . Friedman C Semantic text parsing for patient records. In: Chun H, Fuller S, Friedman C, Hersh W Knowl. Manag. Data Min. Biomed. Springer New York, pp – . Nadkarni PM, Ohno-Machado L, Chapman WW Natural language processing: an introduction. J Am Med Inform Assoc :– . Friedman C, Elhadad N Natural Language Processing in Health Care and Biomedicine. In: Shortliffe EH, James J. Cimino Biomed. Informatics Comput. Appl. 
Heal. Care Biomed. Springer London, pp – . Friedman C, Rindflesch TC, Corn M Natural language processing: state of the art and prospects for significant progress, a workshop sponsored by the National Library of Medicine. J Biomed Inform :– . McCray AT, Srinivasan S, Browne AC Lexical methods for managing variation in biomedical terminologies. Proc Annu Symp Comput Appl Med Care. pp – . Xu H, Stenner SP, Doan S, Johnson KB, Waitman LR, Denny JC MedEx: a medication information extraction system for clinical narratives. J Am Med Informatics Assoc :– . Doan S, Bastarache L, Klimkowski S, Denny JC, Xu H Integrating existing natural language processing tools for medication extraction from discharge summaries. J Am Med Informatics Assoc :– . Sager N, Lyman M, Bucknall C, Nhan N, Tick LJ Natural language processing and the representation of clinical data. J Am Med Informatics Assoc :– . Harris Z Mathematical structures of language. NY: Wiley Interscience . Harris Z A grammar of English on mathematical principles. NY: Wiley & Sons . Harris Z A theory of language and information: a mathematical approach. Oxford: Clarendon Press . Hirschman L, Puder K Restriction grammar: a Prolog implementation. In: van Canegham M, Warren D Log. Program. Its Appl. Ablex Publising Corporation, Norwood, New Jersey, pp – . Sager N, Lyman M, Nhàn NT, Tick LJ Automatic encoding into SNOMED III: a preliminary investigation. Proc Annu Symp Comput Appl Med Care – . Sager N, Lyman M, Nhàn NT, Tick LJ Medical language processing: applications to patient data representation and automatic encoding. Methods Inf Med :– . Friedman C, Anderson PO, Austin JH, Cimino JJ, Johnson SB A general natural-language processor for clinical radiology. J Am Med Informatics Assoc :– . Friedman C, Cimino JJ, Johnson SB A schema for representing medical language applied to clinical radiology. J Am Med Informatics Assoc :– . 
Knirsch CA, Jain NL, Pablos-Mendez A, Friedman C, Hripcsak G Respiratory isolation of tuberculosis patients using clinical guidelines and an automated clinical decision support system. Infect Control Hosp Epidemiol :– . Friedman C, Hripcsak G Natural language processing and its future in medicine. Acad Med :– . Friedman C, Shagina L, Lussier Y, Hripcsak G Automated encoding of clinical documents based on natural language processing. J Am Med Inform Assoc :– . Friedman C, Kra P, Yu H, Krauthammer M, Rzhetsky A GENIES: a natural-language processing system for the extraction of molecular pathways from journal articles. Bioinformatics Suppl :S74– . Haug P, Koehler S, Lau LM, Wang P, Rocha R, Huff S A natural language understanding system combining syntactic and semantic techniques. Proc Annu Symp Comput Appl Med Care. pp – . Haug PJ, Koehler S, Lau LM, Wang P, Rocha R, Huff SM Experience with a mixed semantic/syntactic parser. Proc Annu Symp Comput Appl Med Care. pp – . Koehler S SymText: a natural language understanding system for encoding free text medical data. University of Utah . Christensen LM, Haug PJ, Fiszman M MPLUS: a probabilistic medical language understanding system. Proc. ACL- Work. Nat. Lang. Process. Biomed. domain. pp – . Haug PJ, Christensen L, Gundersen M, Clemons B, Koehler S, Bauer K A natural language parsing system for encoding admitting diagnoses. Proc AMIA Annu Fall Symp. pp – . Fiszman M, Chapman WW, Evans SR, Haug PJ Automatic identification of pneumonia related concepts on chest x-ray reports. Proc AMIA Symp. pp – . Fiszman M, Chapman WW, Aronsky D, Evans RS, Haug PJ Automatic detection of acute bacterial pneumonia from chest X-ray reports. J Am Med Inform Assoc :– . Aronson AR Effective mapping of biomedical text to the UMLS Metathesaurus: the MetaMap program. Proc. AMIA Symp. American Medical Informatics Association, pp – . Aronson AR, Lang F-M An overview of MetaMap: historical perspective and recent advances. 
J Am Med Informatics Assoc :– . Shah PK, Perez-Iratxeta C, Bork P, Andrade MA Information extraction from fulltext scientific articles: where are the keywords ? BMC Bioinformatics : . Meystre SM, Thibault J, Shen S, Hurdle JF, South BR Textractor: a hybrid system for medications and reason for their prescription extraction from clinical text documents. J Am Med Inform Assoc :– . Pakhomov S, Shah N, Hanson P, Balasubramaniam S, Smith SA Automatic quality of life prediction using electronic medical records. AMIA Annu Symp Proc. pp – . Doan S, Lin K-W, Conway M, et al PhenDisco: phenotype discovery system for the database of genotypes and phenotypes. J Am Med Inform Assoc :– . Chapman WW, Bridewell W, Hanbury P, Cooper GF, Buchanan BG A simple algorithm for identifying negated findings and diseases in discharge summaries. J Biomed Inform :– . Mork JG, Bodenreider O, Demner-Fushman D, Dogan RI, Lang F-M, Lu Z, Névéol A, Peters L, Shooshan SE, Aronson AR Extracting Rx information from clinical narrative. J Am Med Inform Assoc :– . Uzuner O, Solti I, Cadag E Extracting medication information from clinical text. J Am Med Informatics Assoc :– . Zeng QT, Goryachev S, Weiss S, Sordo M, Murphy SN, Lazarus R Extracting principal diagnosis, co-morbidity and smoking status for asthma research: evaluation of a natural language processing system. BMC Med Inform Decis Mak : . Goryachev S, Sordo M, Zeng QT A suite of natural language processing tools developed for the I2B2 project. AMIA Annu Symp Proc. p . Savova GK, Masanz JJ, Ogren P V, Zheng J, Sohn S, Kipper-Schuler KC, Chute CG Mayo clinical Text Analysis and Knowledge Extraction System : architecture, component evaluation and applications. J Am Med Informatics Assoc :– . Apache Software Foundation OpenNLP. /. . Savova GK, Ogren P V, Duffy PH, Buntrock JD, Chute CG Mayo clinic NLP system for patient smoking status identification. J Am Med Informatics Assoc :– . 
Sohn S, Savova GK Mayo clinic smoking status classification system: extensions and improvements. Proc AMIA Proc. pp – . De Bruijn B, Cherry C, Kiritchenko S, Martin J, Zhu X Machine- learned solutions for three stages of clinical information extraction: the state of the art at i2b2 . J Am Med Informatics Assoc :– . Albright D, Lanfranchi A, Fredriksen A, et al Towards comprehensive syntactic and semantic annotations of the clinical narrative. J Am Med Informatics Assoc :– . Chapman WW, Nadkarni PM, Hirschman L, D’Avolio LW, Savova GK, Uzuner O Overcoming barriers to NLP for clinical text: the role of shared tasks and the need for additional creative solutions. J Am Med Inform Assoc :– . Ohno-Machado L, Bafna V, Boxwala AA, et al iDASH: integrating data for analysis, anonymization, and sharing. J Am Med Inform Assoc :– . Denny JC Chapter : Mining electronic health records in the genomics era. PLoS Comput Biol :e1002823",
         "Introduction In modern electronic medical records most of the clinically important data – signs and symptoms, symptom severity, disease status, etc. – is not provided in structured data fields, but are rather encoded in clinician-generated narrative text. Natural language processing provides a means of “unlocking” this important data source, converting unstructured text to structured, actionable data for use in applications for clinical decision support, quality assurance, and public health surveillance. There are currently many NLP systems that have been successfully applied to biomedical text. It is not our goal to review all of them in this chapter, but rather to provide an overview of how the field evolved from producing monolithic software built on platforms that were available at the time they were developed to contemporary component-based systems built on top of general frameworks. More importantly, the performance of these systems is tightly associated with their “ingredients” , and how these modules are combined on top of the general framework. We highlight certain systems based on their landmark status as well as on the diversity of components and frameworks they are based on. The Linguistic String Project was an early project starting in that focused on medical language processing . The project created a new schema for representing clinical text and a dictionary of medical terms in addition to addressing several key clinical NLP problems such as de-identification, parsing, mapping, and normalization. The system’s methodology and architecture have substantially influenced many subsequent clinical NLP systems. One of the main requirements for developing clinical NLP systems is a suitable biomedical knowledge resource. The Unified Medical Language System , initiated in by National Library of Medicine, is the most widely used knowledge resource in clinical NLP. 
The UMLS contains controlled vocabularies of biomedical concepts and provides mappings across those vocabularies. With the development of machine learning, NLP techniques, and open-source software, tools have been developed and are now available in open source . These tools can help biomedical researchers re-use and adapt NLP tools efficiently in biomedicine. Several software frameworks that facilitate the integration of different tools into a single pipeline have been developed, such as GATE and UIMA . Given the success of IBM’s Watson in the Jeopardy challenge, the UIMA framework, which was used for real-time content analysis in Watson, has now been applied widely by the biomedical NLP community. The highly recognized open source system cTAKES was the first clinical NLP system to use the UIMA framework to integrate NLP components and is rapidly evolving. / / / / / In this chapter, we provide an overview of NLP systems from a unified perspective focused on system architecture. There are already comprehensive reviews and tutorials about NLP in biomedicine. Spyns provided an overview of pre- biomedical NLP systems , while Demner-Fushman et al. more recently reviewed and summarized NLP methods and systems for clinical decision support . The use of NLP in medicine has been comprehensively reviewed by Friedman , Nadkami et al. , and more recently by Friedman and Elhadad . The review in this chapter differs from previous work in that it emphasizes the historical development of landmark clinical NLP systems, and presents each system in light of a unified system architecture. We consider that each NLP system in biomedicine contains two main components: biomedical background knowledge and a framework that integrates NLP tools. 
In the rest of this paper, we will first outline our model architecture for NLP systems in biomedicine, before going on to review and summarize representative NLP systems, starting with an early NLP system, LSP-MLP, and closing our discussion with the presentation of a more recent system, cTAKES. Finally, we will discuss challenges as well as trends in the development of current and future biomedical NLP systems. Materials A general architecture of an NLP system in biomedicine We start from a discussion by Friedman and Elhadad in which NLP and its various components are illustrated, as reproduced in Fig. . NLP aspects can be classified into two parts in the figure: the left part contains trained corpora, domain model, domain knowledge, and linguistic knowledge; the right part contains methods, tools, systems, and applications. From the viewpoint of system architecture, we consider a general architecture in which an NLP system contains two main components: background knowledge , which corresponds to the left part of the figure, and a framework that integrates NLP tools and modules, which corresponds to the right part of the figure. Our view of a general architecture is depicted in Fig. . Below we describe the two main components and their roles in biomedical NLP systems. Fig. . Aspects of clinical NLP systems as described by Friedman and Elhadad . The rectangles on the left side represent background knowledge, and the components on the right side represent the framework . Background knowledge and framework are the main components of an NLP system. Fig. . A general architecture of a clinical NLP system contains two main components: background knowledge and framework. Background contains ontologies, a domain model, domain knowledge, and trained corpora. Framework includes a low-level processor for tasks such as tokenization and part-of-speech tagging. A high-level processor is used for tasks such as named entity recognition and relation extraction. 
Tasks or modules in the framework can be dependent or independent and are organized sequentially or hierarchically. Background knowledge for NLP in biomedicine: The Unified Medical Language System As mentioned in the introduction, biomedical knowledge is an important component in building clinical NLP systems. Domain knowledge and linguistic knowledge are key elements. Earlier systems such as LSP-MLP built their own medical vocabulary and tools due to the lack of easily available resources at that time. The creation of the Unified Medical Language System , which began development in , substantially benefited clinical NLP systems. The UMLS contains three main components: the Metathesarus, the Semantic Network, and the SPECIALIST lexicon. For practical purposes, the UMLS can be considered to be like an ontology of biomedical concepts and their relations. Below we briefly summarize each component of the UMLS. The UMLS’s Metathesarus currently contains over million biomedical concepts and millions concept names originating from over controlled vocabularies in the biomedical sciences, such as ICD-, MeSH, SNOMED CT, and RxNorm. The UMLS Semantic Network provides a consistent categorization of all concepts represented in the UMLS Metathesaurus. It reduces the complexity of the Metathesaurus by grouping concepts according to semantic types. Currently it contains broad categories and relationships among categories. For example, the category “Disease or Syndrome” has a relationship “associated_with” with the category “Finding”, and the category “Hormone” has a relationship “Affects” with the category “Disease or Syndrome” in the semantic network. The UMLS SPECIALIST lexicon contains syntactic, morphological, and spelling information for biomedical terms . Currently it contains over , terms and is used by the UMLS lexical tools for NLP tasks. 
Background knowledge also includes domain models and trained corpora, which are used to deal with specific domains such as radiology reports, pathology reports, and discharge summaries. Annotated corpora are manually marked-up by human annotators and used to train machine learning linguistic classifiers, as well as to evaluate rule-based systems. NLP tools and integrated frameworks There are two main approaches for building NLP tools. The first is rule-based, which mainly uses dictionary look-up and rules. The second uses a machine learning approach that relies on annotated corpora to train learning algorithms. Early systems often used rule-based approach since they were relatively easy to design and implement. Currently, with the development of robust statistical machine learning methods and an increasing number of annotated corpora, many clinical NLP systems have moved away from relying exclusively on rule-based methods, although there is still a high cost in generating new annotated training data, which are still required to account for differences in tasks, types of documents, as well as their provenance. As shown in many clinical NLP challenges, machine learning methods often achieve better results than rule-based methods. However, rule-based methods are somewhat easier to customize and adapt to a new domain. Most contemporary NLP systems are hybrid, i.e., built from a combination of rule-based and machine learning methods . Fig. shows how NLP tools can be integrated into a pipeline built on top of a particular framework. By framework we mean a software platform for the control and management of components such as loading, unloading, and handling components of the pipeline. Components within a framework can be embedded and linked together or used as plug-ins. For NLP systems in biomedicine, the framework can be divided into two levels: low-level and high-level processors. 
Low-level processors perform foundational tasks in NLP such as sentence boundary detection, section tagging, part-of-speech tagging, and noun phrase chunking. High-level processors perform semantic level processing such as named entity recognition , relation extraction, and timeline extraction. The framework can be integrated into the NLP system itself or it can leverage available general architectures. The two most widely used general architectures are GATE and UIMA . Both consist of open-source software. GATE, written in Java, was originally developed at the University of Sheffield in and is widely used in the NLP community. It includes basic NLP tools for low-level processing packaged in a wrapper called CREOLE, and a high-level processor for named entity recognition packaged in an information extraction system called ANNIE. It can integrate available NLP tools and machine learning software such as Weka , RASP , SVM Light , and LIBSVM . Several clinical NLP systems have used GATE as their framework, such as HITEx , and caTIES for cancer text information extraction. / / / / / / / UIMA, written in Java/C++, was original developed by IBM and is part of the Apache Software Foundation software since . Its motivation is “to foster reuse of analysis components and to reduce duplication of analysis development. The pluggable architecture of UIMA allows to easily plug-in your own analysis components and combine them together with others.” The framework is best known as the foundation of IBM’s 201l Jeopardy challenge Watson system. UIMA’s functionalities are similar to GATE but are more general since UIMA can be used for analysis of audio and video data, in addition to text. There are several clinical NLP systems that use the UIMA framework such as cTAKES , MedKAT/P for extracting cancer-specific characteristics from text, and MedEx for medication extraction. 
System selection In order to give a unified view of system architecture, we selected representative NLP systems in this review. We selected systems due to their historical importance and influence in the biomedical NLP community. We first chose two widely influential landmark clinical NLP systems: LSP-MLP and MedLEE. LSP-MLP is a pioneering project and has greatly influenced subsequent NLP systems, and MedLEE is a system that is currently widely used in clinical NLP communities. We then selected a specific-purpose system called SymText, which was designed for radiology report processing. SymTex began development in the 1990s and is still in active use today. We also briefly review MetaMap, a widely used tool in the biomedical NLP community. We chose two systems based on GATE and UIMA: HITEx, and cTAKES, respectively. Summaries of characteristic features of the clinical NLP systems reviewed in this chapter are presented in Table . / / Table . Summary of characteristic features of some representative clinical NLP systems. System Program ming language Creator Framework Open/Closed source and License Background knowledge resource Clinical domain or source of information Encoding LSP-MLP Fortran C++ New York University Software provided by Medical Language Processing LLC corporation Developed its own medical lexicons and terminologies Progress note, clinical note, X-ray report, discharge summary SNOMED MedLEE Prolog Columbia University Closed source Commercialized by Columbia University and Health Fidelity, Inc. 
Developed its own medical lexicons and terminologies Radiology Mammography, discharge summary UMLS’s CUI SPRUS/ SymText/ MPLUS LISP, C++ University of Utah Closed source UMLS Radiology Concepts from findings in radiology reports ICD- MetaMap Perl, C, Java, Prolog National Library of Medicine Not open source but free available under UMLS Metathesaurus License Agreement UMLS Biomedical text Candidate and mapping concepts from UMLS UMLS’s CUI HITEx Java Harvard University GATE Open source i2b2 software license UMLS Clinical narrative Family history concept, temporal concepts, smoking status, principal diagnosis, co- morbidity, negation UMLS’s CUI cTAKES Java Mayo clinic and IBM UIMA Open source Apache . UMLS + Trained models Discharge summary, clinical note Clinical named entities , relation, co-reference, smoking status classifier, side effect annotator UMLS’s CUI and RxNorm Methods Systems Linguistic String Project – Medical Language Processor The Linguistic String Project was developed in at New York University by Sager et al. . It is one of the earliest research and development project in computer processing of natural language. The development of LSP was based on the linguistic theory of Zellig Harris: linguistic string theory, transformation analysis, and sublanguage grammar . It mainly focused on medical language processing, including the sublanguage of clinical reporting, radiograph reports, and hospital discharge summaries. The LSP approach used a parsing program to identify the syntactic relations among words in a sentence. The project strongly influenced subsequent clinical NLP projects. The LSP’s system was called the Medical Language Processor . The core component of MLP is a parser. The authors first developed a general NLP parser for the general English language domain, including English grammar and lexicon, and then they extended the system to the sublanguage of biomedicine by adding a medical lexicon and corresponding grammar. 
Below we summarize the main components of MLP. Background knowledge • Lexicons: MLP developed lexicons for both general English language and medical knowledge. In the lexicon, each word has an associated part-of-speech and grammatical and medical \"attributes\" called subclasses. The lexicon has possible verb objects and medical sub-classes. It also had lists of pre-defined prepositions, abbreviations, and doses. These attributes are used throughout the processing to guide the parsing and to resolve ambiguities. Pre-defined lists consist of o Standard numbers, times and dates o Medical terms o Dose strings o Organism terms o Geographic nouns o Patient nouns o Institution/ward/service nouns o Physician/staff nouns / • Grammar: The grammar is written in Backus-Naur Form . It finds grammatical structures in clinical text and contains following components: o BNF: the context-free component o The RESTR contains procedures written in the MLP's \"Restriction Language\". Those procedures test the parse tree for the presence or absence of particular features o The LISTS contains lists used in procedures other than RESTR Pipeline • The Preprocessor breaks input text into sentences. Then, the preprocessor identifies possible spelling errors, abbreviations, all forms of names of patients, staff, facilities, and administrative and geographic areas for de-identification. Numbers, units, and dates are transformed into ANSI standard format. • The MLP Parser uses a top-down, context-free grammar-based parser. The system generates multiple parses of ambiguous sentences guided by a BNF grammar. The parser was originally written in FORTRAN, and then partly converted into Prolog . Today it is written in C++. The MLP system is now publicly available through the Web site provided by Medical Language Processing, LLC - a Colorado corporation . The parser proceeds from left to right through the sentence, and top to bottom through the BNF definitions. 
Once the parser associates a terminal symbol of the parse tree, the attributes of the word can be tested by a restriction, for example, the agreement of subject and verb. The following steps are involved in the processing of text: • Selection passes or rejects a parse based on subtrees. • Transformation decomposes sentences into their basic canonical sentences. • Regularization connects basic canonical sentences by conjunctions. • Information format maps the syntactic parse trees into medical information structures. MLP considers information structures related to patients such as patients, family, medication, treatments, lab/test, etc. Finally, the output is written into two formats: tab delimited and XML format. / Fig. . Architecture of MedLEE, where the background knowledge contains components for the lexicon, grammar, mappings, and coding tables. The low-level processor is a preprocessor and the high-level processor consists of modules for parsing, error recovery, phrase regularization, and encoding. LSP-MLP was used for processing clinical narratives in English, and it was also extended into other languages such as French, German, and Dutch . It has been used to map clinical text into SNOMED codes . LSP-MLP was designed for information retrieval from clinical text, hence there were no reports evaluating mapping. The performance in information retrieval tasks indicated .% recall and .% precision . With its complete structures, LSP-MLP provided an early successful example for the development of subsequent NLP systems. MedLEE The MedLEE system was developed by Friedman et al. at Columbia University in . It was first designed for radiology reports, and was then extended to other domains such as discharge summaries. The system was written in Quintus Prolog. MedLEE contains two main components: a knowledge base including medical concepts, and a natural language processor. 
MedLee was the first NLP system used as part of a system for actual patient care and some systems in which it was embedded have been shown to improve care . It was commercialized in . The architecture of MedLEE is depicted in Fig. . Background knowledge MedLEE’s knowledge base is called the Medical Entities Dictionary , which contains a knowledge base of medical concepts and their taxonomic and semantic relations. Each concept in MED is assigned to an identifier. The MED originally contained over , concepts. Pipeline The natural language processor has three phases of processing as follows. • Phase : Parsing. Identify the structures of the text through use of a grammar. It contains three main components: a set of grammar rules, semantic patterns, and lexicon. o Grammar rules: MedLEE uses a BNF grammar, originally contained grammar rules. o Semantic classes. MedLEE considers sentences contain semantic patterns connected by conjunctions. Semantic patterns can be a word, phrase and/or belong to a semantic class. Examples of semantic classes are Bodyloc, Cfinding, and Disease. MedLEE also considers negation as a semantic pattern in its grammar. o Lexicon. The semantic lexicon originally contains both single words and phrases . • Phase : Phrase regularization. This module regularizes the output forms of phrases that are not contiguous. This is a critical step that further reduces the variety that occurs in natural language. The method is automatically applied by processing all phrasal lexical entries that begin with the symbol phrase. Phrase is used to specify that a phrase may occur in a non-contiguous variant form. • Phase : Encoding. This step maps the regularized structured forms to controlled vocabulary concepts. This process is accomplished using a knowledge base containing synonymous terms. The synonym knowledge base consists of associations between standard output forms and a controlled vocabulary. 
At the end of this stage of processing, the only values that remain are unique controlled vocabulary concepts. The output of MedLEE is represented as a formal model of clinical information in the domain of interest such as radiology. It has been extended to map extracted concepts into UMLS codes , and its architecture was also extended to build an information extraction system for molecular pathways from journal articles . Evaluation on randomly sentences from clinical documents achieved . Recall and . Precision compared to .-. Recall and .-. Precision for seven domain experts performing the same tasks . SPRUS/SymText/MPLUS SPRUS/SymText/MPLUS was developed in by Haug et al. at the University of Utah. It has been implemented using common LISP, the Common Lisp Object System , and C++. The original system was called SPRUS, and it evolved into SymText , NLUS , and the latest version of system, MPLUS . The system was specifically designed for processing chest radiograph reports. Background knowledge • SPECIALIST lexicon from UMLS, a synonyms database, part-of- speech lexicon. • An ATN grammar, a transformational rule base, and a set of resolution strategy rules. • Knowledge bases also contain belief network node structures, values, and training cases for each context. The context was pre- defined such as events in chest radiology reports. Pipeline SymText consists of three primary modules for the analysis and interpretation of sentences . • First, a structural analyzer generates an initial structural interpretation of a sentence. • Second, a transformational module transforms the initial structure according to the targeted semantic contexts. • Third, a resolution module semantically resolves the conceptualizations of the text according to its structure. Encoded data are the system's outputs. SymText’s outputs contain three semantic concepts: finding, disease, and appliances . 
The distinct feature of SymText when compared to other systems is that it uses belief networks to represent biomedical domain knowledge and discover relationships between nodes within parse trees. SymText has been used in several applications such as mapping chief complaints into ICD- codes and extracting pneumonia-related findings from chest radiograph reports . Fig. . Architecture of the MetaMap system, modified from the original , where background knowledge is based on UMLS and different modules represent the pipeline. Evaluation using chest radiograph reports to identify pneumonia-related concepts showed that the system achieved . Recall, . Precision, and . Specificity, outperforming lay persons . MPLUS was evaluated for the extraction of American College of Radiology utilization review codes from head CT reports. The system achieved . Recall, . Specificity and . Precision in identify reports as positive, i.e., containing brain findings . MetaMap MetaMap was originally developed in by Aronson at the National Library of Medicine. It was created for mapping the biomedical literature to concepts in the Unified Medical Language System Metathesaurus . It has been widely used for processing clinical text . The tool uses a variety of linguistic processes to map from text to Concept Unique Identifiers in the UMLS. It is written in Perl, C, Java and Prolog. The architecture of Metamap is depicted in Fig. . / Background knowledge The UMLS is used as the knowledge resource. Pipeline The most recent version of the system, as described by Aronson and Lang , has a two stage architecture: • Lexical/syntactic processing. o Tokenization o Part-of-Speech tagging o Lexical lookup that uses the UMLS SPECIALIST lexicon o Syntactic analysis that generates phrases for further processing • Phrasal processing. 
o A table lookup is used to identify variants of phrase words o Candidate identification identifies and ranks strings from the UMLS that match phrasal terms o Mapping to text through selection, combination and mapping of candidates to the text o Word sense disambiguation selects senses consistent with the surrounding text MetaMap’s output can be provided either in XML format, MetaMap Output , or human readable formats. Since its initial development MetaMap has been used in a variety of clinical text processing tasks. For example, Shah et al used it to extract cause of death from electronic health records, while Meystre et al. used it to extract medication information from the clinical record. Pakhomov et al. used MetaMap to extract Health Related Quality of Life Indicators from diabetes patients described in physician notes. Recently, Doan et al. used MetaMap for phenotype mapping in the PhenDisco system, a new information retrieval system for the National Center for Biotechnology Information’s database of genotypes and phenotypes . The MetaMap tool is highly configurable, consisting of such advanced features such as negation detection and word sense disambiguation . Although not open source, the software is freely available for free from the National Library of Medicine as a standalone command-line tool implemented primarily in Prolog. Fig. . Architecture of HITEx system, simplified from the original publication by Zeng et al . In addition to the Prolog version of MetaMap, a web-based interface is available that facilitates simple queries and also batch processing of text. Furthermore, a Java implementation of MetaMap, MMTx, is available although this version is no longer under active development. MetaMap was used by the NLM team in the i2b2 challenge on medication extraction. It achieved an F-score of ., with Precision . and Recall .. Although it ranked fourth in the challenge, it had the highest Recall among participating teams . 
Another system that used MetaMap, Textrator, developed by Meystre et al. was also among the top ten in that competition . HITEx HITEx is an open-source NLP system developed at Brigham and Women's Hospital and Harvard Medical School. It was built based on the GATE framework. The system leverages a set of NLP modules known as CREOLE in GATE for low-level processing, such as sentence splitting and part-of-speech tagging. Other components for high-level processor, such as a UMLS mapper and classifier, were developed as plug-in components and are easily handled for loading/reloading. The architecture of HITEx is depicted in Fig. . Background knowledge HITEx uses UMLS for background knowledge. It has trained corpora for several tasks such as for building a classifier for status-smoking status. Pipeline HITEx contains following modules integrated in the GATE framework. • The Section splitter/filter splits clinical reports into sections and assigns them to section headers. There are over section headers in HITEx. Then it filters sections based on selection criteria such as section names, etc. • The Sentence splitter breaks sections into sentences. It is based on regular-based rules. • The Sentence tokenizer breaks sentences into words, it uses an extensive set of regular expressions that define both token delimiters and special cases. • The Part-of-speech tagger assigns part-of-speech tags to each token in the sentence. This module is a rule-based POS tagger as a plug-in for the GATE framework. • The Noun phrase finder groups POS-tagged words into the noun phrases using the set of rules and the lexicon. This module is a plug- in for the GATE framework. • The UMLS mapper associates the strings of text to UMLS concepts. It uses a UMLS dictionary look-up: it first attempts to find exact matches, and when exact matches are not found it stems, normalizes, and truncates the string. • The Negation finder assigns the negation modifier to existing UMLS concepts. 
It used the NegEx algorithm . • The N-Gram tool extracts n-word text fragments along with their frequency from a collection of text. • The Classifier takes a smoking-related sentence to determine the smoking status of a patient. It determines one of following classes: ‘current smoker’, ‘never smoked’, ‘denies smoking’, ‘past smoker’ or ‘not mentioned’. The system has been used for the extraction of family history from discharge summaries, with accuracies of . for principal diagnosis, . for co-morbidity, and . for smoking status extraction, when excluding cases labeled \"Insufficient Data\" in the gold standard . cTAKES The cTAKES system , initiated by a Mayo-IBM collaboration in , was first released as an open source toolkit in by Savova et al. It is an open-source software system under the Apache v2. license and is widely used by multiple institutions. The system leverages NLP tools from OpenNLP with trained clinical data from Mayo Clinic. It is the first clinical NLP system to adopt UIMA as its framework. Background knowledge cTAKES used trained corpora from Mayo clinic data and other sources, utilizing the UMLS as the main background knowledge. Trained corpora were used for low-level processing such as sentence splitting and tokenizing. The UMLS was used for named entity recognition look-up. Pipeline cTAKES employs a number of rule-based and machine learning methods. The system can take inputs in plain text or in XML format. It initially included these basic components: • The Sentence boundary detector extends OpenNLP’s supervised maximum entropy sentence detection tool. • The Tokenizer breaks sentences into tokens and applies rules to create tokens for date, time, fraction, measurement, person title, range, and roman numerals. • The Normalizer maps multiple mentions of the same word that do not have the same string in the input data. It leverages the SPECIALIST NLP tools from the National of Library Medicine. 
• The POS tagger and the Shallow parser are wrappers around OpenNLP’s modules. • The Named Entity Recognizer uses a dictionary lookup based on noun phrase matching. The dictionary resource is from UMLS. It maps words into UMLS semantic types including diseases/disorders, signs/symptoms, procedure, anatomy and medications. After being mapped into semantic types, name entities are also mapped into UMLS’s CUIs. / / cTAKES incorporates the NegEx algorithm for detecting negation from clinical text. Since UIMA is a framework that can easily adapt to new modules, cTAKES integrates other modules such as an assertion module, a dependency parser, a constituency parser, a semantic role labeller, a co-reference resolver, a relation extractor, and a smoker status classifier. There has been considerable focus on the evaluation of cTAKES core preprocessing modules. The sentence boundary detector achieved an accuracy of ., while tokenizer accuracy was also very high at .. Both part-of-speech tagger and shallow parsing performed well, achieving accuracies of . and ., respectively. For NER, the system achieved a . F-score for exact and a . F-score for overlapping span . cTAKES was first applied to phenotype extraction studies and then was extended to identify document-level patient smoking status and patient level summarization in the first i2b2 challenge . The system was used to generate features for a state-of-the-art system in the i2b2 challenge on relation extraction of medical problems, tests, and treatments .",
         "Geography, Medicine, Computer Science, Engineering, Political Science, Physics",
         "a0c0918623392a317b944c266deacf16e186660f",
         "9.05511811023622",
         "No",
         "unified system architecture natural language processing biomedicine?",
         "NLP",
         "Natural Language Processing in Biomedicine: A Unified System Architecture Overview",
         "Methods in molecular biology",
         "2014",
         "4",
         "Workshop/Other",
         "508",
         "7737.0",
         "cs.cl"
        ],
        [
         "1",
         "In this paper we examine the usefulness of two classes of algorithms Distance Methods, Discrete Character Methods (Felsenstein and Felsenstein 2003) widely used in genetics, for predicting the family relationships among a set of related languages and therefore, diachronic language change. Applying these algorithms to the data on the numbers of shared cognates- with-change and changed as well as unchanged cognates for a group of six languages belonging to a Dravidian language sub-family given in Krishnamurti et al. (1983), we observed that the resultant phylogenetic trees are largely in agreement with the linguistic family tree constructed using the comparative method of reconstruction with only a few minor differences. Furthermore, we studied these minor differences and found that they were cases of genuine ambiguity even for a well-trained historical linguist. We evaluated the trees obtained through our experiments using a well-defined criterion and report the results here. We finally conclude that quantitative methods like the ones we examined are quite useful in predicting family relationships among languages. In addition, we conclude that a modest degree of confidence attached to the intuition that there could indeed exist a parallelism between the processes of linguistic and genetic change is not totally misplaced.",
         "None, 2883464, 2159697",
         "2",
         "Conclusion and Future Work To the best of our knowledge, this is the first time ever that methods from bioinformatics and computational biology have been applied to Dravidian language data for phylogenetic inference. On the basis of a comparison of the resulting trees with the standard tree and also the trees found in other earlier work, we seek to evaluate the performance of these methods for phylogenetic inference. The trees inferred using the different quantitative methods are largely in agreement with the linguistic facts. Character-based methods outperformed the distance-based methods going by Krishnamurti’s scoring criterion. In fact, the performance of the distance-based methods is itself quite decent. The UPGMA tree rightly rules out the wrong predictions cited by Krishnamurti. Since we ensured that the data was noise-free by using well-known data sets carefully prepared by an expert, we can unambiguously claim that the good performance of these quantitative methods is a reflection of their usefulness for the historical linguistic task of phylogenetic inference. In addition to correctly inferring the structure of the family tree, these methods have an added advantage of being able to return the branch lengths in a tree. These branch lengths can be used to calibrate the divergence times of the tree and can throw light upon the antiquity of the Dravidian language family although such dating should be done with sufficient attention to detail and more importantly, under the supervision of a trained linguist to prevent spurious dates. We could not address the issue of dating as we lack the required expertise. We hope that dates estimated using qunatitative methods such as the ones discussed in our work become a kind of starting point for a detailed investigation into the antiquity of the Dravidian language family. 
The results of our experiments also validate the hypothesis that Language being a natural system, the processes underlying it are the same as those underlying any other natural system and hence, there could exist a parallelism between language change and genetic change. We conclude with the remark that quantitative methods such as the ones we surveyed can reliably infer language phylogeny and certainly merit consideration in Historical linguistics. However, it must also be mentioned that the phylogenetic trees outputted by these methods must not be treated as the final word on Language phylogeny and need to be linguistically verified at every stage. In fact, we would recommend that these methods be tried out by researchers in historical linguistics on different data sets under the close C2: CVL > CLV > LV. supervision of experts so that their veracity can be better ascertained. References Andronov, M. . Lexicostatistic analysis of the chronology of disintegration of proto-Dravidian. Indo- Iranian Journal :–. Atkinson, Q.D. and R.D. Gray. . Curious parallels and curious connections: Phylogenetic thinking in biology and historical linguistics. Systematic Biology pages –. Atkinson, QD and RD Gray. . How old is the Indo-European language family? Progress or more moths to the flame. Phylogenetic Methods and the Prehistory of Languages pages –. Atkinson, Q., G. Nicholls, D. Welch, and R. Gray. . From words to dates: water into wine, mathemagic or phylogenetic inference? Transactions of the Philological Society :–. Bakker, D. . LINFER and the WALS database. In Workshop on Interpreting Typological Distributions, Leipzig. Barbançon, F., T. Warnow, S.N. Evans, D. Ringe, and L. Nakhleh. . An experimental study comparing linguistic phylogenetic reconstruction methods. Tech. rep., Technical Report , Department of Statistics, University of California, Berkeley. Bryant, D., F. Filimon, and R.D. Gray. . Untangling our past: Languages, trees, splits and networks. 
The Evolution of Cultural Diversity: a Phylogenetic Approach pages –. Campbell, L. . Historical linguistics: an introduction. MIT Press. D’Andrade, R.G. . U-statistic hierarchical clustering. Psychometrika :–. Dyen, I., J.B. Kruskal, and P. Black. . An Indoeuropean classification: a lexicostatistical experiment. Amer Philosophical Society. Evans, S.N., D. Ringe, and T. Warnow. . Inference of divergence times as a statistical inverse problem. Phylogenetic Methods and the Prehistory of Languages. McDonald Institute Monographs pages – . Felsenstein, J. and J. Felsenstein. . Inferring phylogenies. Sinauer Associates Sunderland, Mass., USA. Huelsenbeck, J.P., F. Ronquist, R. Nielsen, and J.P. Bollback. . Bayesian inference of phylogeny and its impact on evolutionary biology. Science :–. Kondrak, G. . Algorithms for language reconstruction. University of Toronto Toronto, Ont., Canada, Canada. Krishnamurti, B. . Areal and lexical diffusion of sound change. Language :–. Krishnamurti, B. . The Dravidian languages. Cambridge University Press. Krishnamurti, B., L. Moses, and D. Danforth. . Unchanged cognates as a criterion in linguistic subgrouping. Language :–. Nakhleh, L., D. Ringe, and T. Warnow. 2005a. Perfect phylogenetic networks: A new methodology for reconstructing the evolutionary history of natural languages. Language :–. Nakhleh, L., T. Warnow, D. Ringe, and S.N. Evans. 2005b. A comparison of phylogenetic reconstruction methods on an Indo-European dataset. Transactions of the Philological Society :–. Ringe, D., T. Warnow, and S. Evans. . Polymorphic characters in Indo-European languages. Languages and Genes, September . Ringe, D., T. Warnow, and A. Taylor. . Indo-European and computational cladistics. Transactions of the Philological Society :–. Ryder, R.J. . Grammar and Phylogenies . Saitou, N. . The neighbor-joining method: a new method for reconstructing phylogenetic trees. Swadesh, M. . 
Lexico-statistic dating of prehistoric ethnic contacts: with special reference to North American Indians and Eskimos. Proceedings of the American philosophical society pages –. Colophon We are much grateful to Prof. Bh. Krishnamurti for pointing out his paper to us without which we could not have had the right data to conduct our experiments. We thank the following for their proofreading and helpful comments: Prasanth Kolachina, Akshat Kumar, Sarath Chandra Addepalli and Karthik Gali.",
         "Introduction Ever since the beginning of evolutionary thought, intuitions have been galore about the relevance of the process of evolution to Language change. In the field of linguistic theory itself, the idea of “a common origin” had existed long before Darwin’s observation of ‘curious parallels’ between the processes of biological and linguistic evolution. The birth of comparative philology as a methodology is often attributed to that now very well-known observation by Sir William Jones that there existed numerous similarities between far-removed languages such as Sanskrit, Greek, Celtic, Gothic and Latin which was impossible unless they had ‘sprung from some common source, which perhaps no longer exists’. This observation also marked the birth of the Indo-European language family hypothesis. Though Jones may not have been the first to suggest a link between Sanskrit and some of the European languages, it was only after his famous remarks that explanations for the enormous synchronic diversity of language started assuming a historical character. Up until that point in linguistic theory, explanations for the similarity and therefore, the relationship between different languages had been purely taxonomic and essentially ahistorical. See Atkinson and Gray for a very interesting and comprehensive comparative study of the historical development of linguistic and biological theory. In the history of its development, the field of linguistics in general has crossed paths with biology on more than one occasion. One of the most significant interactions between these two disciplines witnessed the emergence of the biological nativist school of thought in the last century. Language came to be seen as a biological system rather than being a cultural artefact alone. 
Subsequent to this development but perhaps not directly related to it, Language became the object of study of a specialized area that has been referred to as Natural Signal Processing in the field of computer science. Attempts to discover the models underlying these natural systems were made as part of what was known as model-based analyses. The work that followed during this period saw a drastic cross-pollination of ideas across different domains. Computational/Quantitative methods proven to be useful in the analysis of a particular natural signal were applied to other natural signals and the results were studied. In recent times, one such attempt at cross- pollination has been the application of quantitative methods developed in the fields of Bioinformatics and Computational biology to language data. The intuition is that these methods which infer genetic phylogeny from surface gene sequence data can do so from language data too. This would amount to saying that the model underlying genetic change is similar to the one underlying language change. In this paper, we survey some of these quantitative methods and test their performance for inferring the language tree of the South-Central Dravidian sub-family from surface data. The goal of our study is to validate the use of such methods in Historical linguistics via a comparative study of the trees inferred computationally from surface data and the standard linguistic tree constructed on the basis of various phonological and morphological isoglosses. The outline of this paper is as follows. Section gives the basics and background of various terms used in bioinformatics/computational biology for inferring phylogenetic trees and their relevance in historical linguistics. Section describes the dataset used in our experiments. Sections describes the distance methods and the results of our experiments. Section describes the character based methods and the results. 
We discuss the trees resulting from our experiments and compare them against the linguistically constructed tree in section which is followed by conclusion and future work. Basics and Related Work The first attempt to apply quantitative methods to the historical study of language was made by the linguist Swadesh when he developed glottochronology and lexicostatistics. However, divergence time estimates for languages made using the glottochronological method have come to be seen by linguists as unreliable for a host of reasons . Although much hyped during the initial stages of its development, it was not long before historical linguists became disillusioned with this first attempt to employ quantitative methods in the study of language relationships. A widespread skepticism about the efficacy of quantitative models that predict linguistic relationships from lexical data has prevailed within the historical linguistic community ever since. However, in recent years, a new set of quantitative techniques emerged in computational biology that Lexicostatistical methods infer linguistic relationships based on the number of shared cognates among languages. The comparision is done using a basic meaning list which is supposed to be culture-free and universal and hence, resistant to borrowing and replacement. This list comprises concepts such as body parts, numerals, elements of nature, etc. The first step is to collect the commonly used words in each language for each of these universal concepts in this list. In the second step, within the set of words from all the languages corresponding to a concept, sets of possible cognates are identified. The cognacy judgements are made on the basis of systematic sound correspondences among the words of different languages. Known cases of borrowings are discarded from the list. In the third step, the distance between every pair of languages is calculated as the number of cognates shared by those two languages. 
By using techniques like UPGMA, a family tree for a set of languages can be constructed using all the pairwise distances. Glottochronology goes one step further and estimates the divergence times for each node in the family tree. It assumes that the rate of lexical replacement is constant for all languages at all times. This constant is called the glottochronological constant and its value was assumed to be fixed at . . Swadesh used the following formula to estimate the divergence times of the Amerindian language family where r is the glottochronological constant and c is the percentage of shared cognates. The glottochronological method has been criticised for the following reasons. First, there is a loss of information when the character-state data is converted to percentage similarity scores. Second, the case of a language having multiple or no words is not handled. Third, the assumption of a universal rate constant is disputable as the rate of evolution across languages has been observed to be variable. Fourth, the UPGMA method based on the percentage of shared cognates can produce inaccurate branch lengths and thus, lead to erroneous divergence times. In addition, the glottochronological method does not address the phenomena of reticulate evolution and parallel development at all. For these reasons, historical linguists disapprove of glottochronology as a valid method for the diachronic study of language. could infer genetic phylogeny from gene sequence data. Researchers soon realized that these methods could be applied to language data too. Languages like genetic taxa shared changed traits and language data like genetic data could be represented as state sequences. All of this resulted in a renewed interest in the application of quantitative methods to language data. 
The availability of data sets for well- established language families like Indo-European has spurred a number of researchers to apply these methods to these data sets and validate the resultant phylogenetic trees against the well- established linguistic facts and to test competing hypotheses. These methods are of two types: character based and distance based. We give an overview of the basic terminology in the following section. . Terminology .. Characters Language evolution can be seen as a change in some of its features. A character encodes the similarity between languages based on the values of these features and defines an equivalence relation on the set of languages L . Defining a character formally A character is a function c : L → Z where L is the set of languages and Z is the set of integers. A character can take different values across a set of languages indicating that these languages have different “states” with respect to that character. Two languages would have the same state with respect to a particular character if they have the same value for that character. The actual values of these characters are not important . Characters can either be lexical, phonological or morphological features. A lexical character corresponds to a meaning slot. For a given meaning, lexical items from different languages fall into different cognate classes which are then represented by different states for that lexical character. Two languages would have the same state if they have lexical items which are cognates. Figure shows an example of how lexical characters are represented as states. The superscript shows the state exhibited by each language for a particular meaning slot. FIGURE : An excerpt from the Dyen’s Comparative Indo-European database Morphological characters are normally inflectional markers and like lexical items, are coded based on cognation. 
Phonological characters are used to represent the presence or absence of a particular sound change in a set of languages. .. Homoplasy and Perfect Phylogenies Two languages can share the same state not only due to shared evolution but also due to phenomena of backmutation and parallel development . These phenomena are jointly referred to as homoplasy . For a particular character, if an already observed state reappears in the tree, then the phenomenon is called backmutation. Two languages may evolve independently in a similar fashion. In that case the two languages exhibit the same state despite evolving independently. This phenomenon is known as parallel development. Much of the related work in this area was based on the assumption of a homoplasy-free evolution . When a character evolves without homoplasy down the tree then it is said to be compatible for that tree and the tree is said to be a perfect phylogeny . Hence, everytime a character’s state changes in the tree, all the subtrees rooted at that point share the same state. Another source of ambiguity in the states of a character can be due to borrowing and is normally avoided by discarding all known cases of borrowings. . Related Work The fashion in which characters evolve down the tree is described by a model of evolution. This specification or non-specification of models of evolution broadly divides the phylogenetic inference methods into two categories. For example, methods such as Maximum Parsimony , Maximum Compatibility and Distance methods such as Neighbour Joining and UPGMA do not require an explicit model of evolution. But other statistical methods like Maximum Likehood and Bayesian Inference are parametric methods which assume a model of evolution. The parameters of the model are tree topology, branch length and the rates of variation across characters. 
An interesting debate is going on in the scientific community regarding the appropriateness of the assumption of a model of evolution for linguistic data . Gray and Jordan were among the first to apply the Maximum Parsimony method to Austronesian language data. They applied the technique to , lexical items from Austronesian languages and were able to get a single most parsimonious tree. The maximum parsimony method returns that tree on which the minimum number of character state changes have taken place. There are different types of parsimonies such as Wagner, Camin-Sokal, etc and each of them makes different assumptions about the character state changes. These parsimonies are discussed in detail in section . Of particular interest is the work of in which they applied Bayesian inference techniques to the Indo-European database. They used a matrix of binary values to represent the states of the languages for the lexical characters. Although their tree was identical to the tree established by the historical linguists using the comparative method , the dating based on penalised likelihood supported the famous Anatolian origin hypothesis as opposed to the Krugan hypothesis, dating the Indo-European family as being around years old. Their model assumes that the cognate sets evolve independently. They use a gamma distribution to model the variation across the cognate sets and try to find a sample of trees which best fits their data. Unlike the other non-parametric methods mentioned above, this method can handle polymorphism. By representing cognate information in terms of binary matrices , unlike glottochronology, there is no loss of information in this model. They also tested their model in scenarios where the cognacy judgements were not completely accurate and where the model misspecification could cause a bias in the estimate using an additional data set of ancient languages prepared by Ringe et al. . 
They further tested their model on a synthetic data set which allowed for borrowing to occur between different lineages. The model was tested against two kinds of borrowing viz- borrowing between any two lineages and borrowing between lineages which are located locally. The dating in all the above cases was largely consistent with the dating they had obtained with the Dyen’s dataset and this, they claim, demonstrates the robustness of the model. Ryder in his work used syntactic features as characters and applied the above methods for inferring the phylogenetic tree for the Indo-European language family. He also used the same techniques on various language family data sets for grouping related languages into their respective language families. The syntactic features were obtained from the WALS database . The assumption underlying the use of syntactic features is that the rate at which syntactic features can be replaced is much less than that of The position of Albanian is not resolved lexical features/items. FIGURE : An example of the binary matrix used by Atkinson and Gray . Ringe et al. proposed a computational technique called Maximum Compatibility for constructing phylogenetic trees. This technique seeks to find the tree on which the highest number of characters are compatible. Their model assumes that the lexical data is free of back mutation and parallel development . The method was applied to a data set of ancient and modern Indo-European languages. They use morphological, lexical and phonological characters to infer the phylogeny of these languages. Nakhleh et al. proposed an extension to this method known as Perfect Phylogenetic Networks which models homoplasy and borrowing explicitly. For a comparison of the various phylogenetic methods on the ancient Indo-European data, refer . They observed that almost all the methods, except UPGMA, produced trees which although partly similar, were strikingly different from one another in several ways. 
It must be noted that these scholars did not seek answers to much-disputed questions in the literature on the Indo-European language family tree, such as the status of Albanian in their afore- mentioned quantitative analyses. In each of the attempts discussed till now, the main thrust has been to demonstrate that language phylogeny as inferred using these quantitative methods was in almost perfect agreement with the traditional comparative method-based family tree thus demonstrating the utility of quantitative methods for the study of language change. An earlier attempt to apply quantitative methods to the Dravidian language family data was made by Andronov using glottochronology. His dating of the Dravidian language family divergences was criticised for the largely faulty data used by him as it made the dating unreliable and untenable . In other related work on the application of quantitative methods to the Dravidian language family, Krishnamurti et al. used unchanged cognates as a criterion for the subgrouping of South-Central Dravidian languages. Krishnamurti prepared a list of cognates in all the six languages which he determined would be sufficient for inferring the language tree of the family within the framework of lexical diffusion. They examined a total of rooted binary trees and scored the trees based on the number of changes for all the entries. The tree with the least score was the considered the best. Another related attempt was made by D’Andrade applying a technique called U-statistic hierarchical clustering to the same data set. In both these attempts, the trees obtained using the quantitative method/criterion were compared against the standard Dravidian language family tree constructed by Prof. Krishnamurti using the classical comparative method. We reproduce this widely-accepted standard tree here in Fig . 
FIGURE : Tree constructed using Comparative Method In our work, we deliberately avoided the use of lexical characters as they are noise-prone although much of the recent work in this area uses lexical characters for phylogenetic inference. Since the goal of our study was to evaluate the usefulness/performance of two classes of computational methods for historical linguistic tasks, we had to ensure that the data set used in the experiments was highly reliable for the evaluation to be accurate. This is why we chose to work with the same set of languages that Krishnamurti et al. studied. We converted the list of changed/unchanged cognates given in that paper into phonological character data by treating each cognate as a character. By comparing the trees obtained by applying the character-based methods to this phonological character data with the standard tree in Fig , we verify the usefulness of these methods for phylogenetic inference. Dataset We used two different sets of data in our experiments. The data sets contain data from six South-Central .) group of Dravidian Languages - viz. Gondi, Koṇḍa, Kui, Kuvi, Pengo, Manḍa. The first data set is the number of pair- wise shared cognates-with-change for all language pairs shown below in matrix form. The distance-based methods take this matrix as their input ). Gondi Kon ̣ d ̣ a Kui Kuvi Pengo Kon ̣ d ̣ a Kui Kuvi Pengo Mand ̣ a TABLE : Matrix of shared cognates-with-change The second data set was taken from Krishnamurti who provided a list of cognate items which were qualified for study within the lexical diffusion framework. For each of these items, we represented languages with unchanged cognate as having state ‘’ and those with changed cognates as having state ‘’. Thus we were able to represent this changed/unchanged cognate data as phonological characters with binary states. 
Distance Methods Distance-based methods take as input a matrix containing all the pair-wise distances for a given set of taxa and output the phylogenetic tree which explains the data. The assumption of a lexical clock may or may not be required depending upon the method. In our study we examined two such methods which are very popular in evolutionary biology and have also been widely applied to historical linguistic data. UPGMA The lexicostatistical experiment for IE languages by Dyen et al. used this method for the construction of the family tree. This method can be described by the following psuedo-code: . Find the two closest languages based on percentage of shared cognates. . Make L1,L2 siblings. . Remove one of them, say L1 from the set. . Recursively construct the tree on the remaining languages. . Make L1 the sibling of L2 in the final tree. UPGMA assumes a uniform rate of evolution throughout the tree i.e, the distance of the root node to the leaves is equal. This algorithm produces a rooted tree whose ancestor is known. Neighbour Joining Neighbour Joining is a type of agglomerative clustering developed by Saitou . It is a greedy method like UPGMA but unlike UPGMA it does not assume a uniform lexical clock. In addition, this method produces an unrooted tree along with the branch lengths which needs to be rooted for inferring the ancestral states and the divergence times for the languages. The method starts out with a star-like topology and then tries to minimize the estimate of the total length of the tree by combining together the languages that provide the maximum reduction. It has been mathematically shown that this method is statistically consistent . There are other distance-based methods such as FITSCH and KITSCH which are relatives of UPGMA and NJ but we do not discuss them in this paper. A general observation about the class of distance-based methods is that the Neighbour Joining method outperforms all other methods in this class. . 
Results for distance methods The tree structure in figure is the South-Central Dravidian sub-family tree constructed by Krishnamurti based on various morphological and phonological isoglosses. The similarity matrix in table is converted into a distance matrix using the formula . Figures and show the trees obtained by applying UPGMA and NJ methods to this pair-wise distance data. FIGURE : Phylogenetic tree using UPGMA FIGURE : Phylogenetic tree using Neighbour Joining Character Methods Maximum Parsimony With the exception of Bayesian analysis, Parsimonous methods have been claimed to be the most efficient for inferring the tree that is closest to the traditional standard tree . There are various types of parsimonies depending upon the number of states and the kind of transitions between the states. In our study, we limit ourselves to three kinds of parsimonies- Camin-Sokal, Wagner and Dollo parsimony. We reproduce the assumptions of each of these parsimonies as given in here. Assumptions of Camin-Sokal’s and Wagner’s parsimony . Ancestral states are known or unknown . . Different characters evolve independently. . Different lineages evolve independently. . Changes → are much more probable than changes → or equally probable . . Both these kinds of changes are a priori improbable over the evolutionary time spans involved in the differentiation of the group in question. . Other kinds of evolutionary events such as retention of polymorphism are far less probable than → changes. . Rates of evolution in different lineages are sufficiently low so that two changes in a long segment of the tree are far less probable than one change in a short segment. We also tested the effect of the hypothesis of the irreversibility of sound change by giving equal chance to change in both the directions. 
While Camin-Sokal parsimony corresponds to the case of sound change being irreversible, Wagner parsimony on the other hand, assigns equal probability to the occurrence of a sound change in both the directions. FIGURE : Phylogenetic tree using Wagner’s parsimony FIGURE : Phylogenetic tree using Wagner’s parsimony FIGURE : Phylogenetic tree using Camin-Sokal parsimony Assumptions of Dollo’s Parsimony . We know which state is the ancestral one . . The characters evolve independently. . Different lineages evolve independently. . The probability of a forward change is small over the evolutionary times involved. . The probability of a reversion is also small, but still far larger than the probability of a forward change, so that many reversions are easier to envisage than even one extra forward change. . Retention of polymorphism for both states is highly improbable. . The lengths of the segments of the true tree are not so unequal that two changes in a long segment are as probable as one in a short segment. Dollo’s parsimony is based on Dollo’s law which states that traits can evolve only once. In this context, the evidence of cognates which represent the process of diffusion of sound change still in process, can be treated as trait. This is equivalent to stating that sound change is homoplasy free. In other words, Sound change has diffused across the languages at a common stage in their evolution rather than occur at a later stage when the languages have diverged. This kind of parsimony also allows for determining the root of the tree. FIGURE : Phylogenetic tree using Dollo’s parsimony FIGURE : Phylogenetic tree using Dollo’s parsimony Bayesian Inference of Phylogenies This is a different class of character-based methods which is an extension of the maximum likelihood method. We used Metropolis-coupled Markov Chain Monte Carlo for sampling the posterior probabilities of the trees. This method was employed by Atkinson et al. 
and has been discussed in some detail in the related Work section. We experimented with different parameter settings and observed their effect on the inferred trees. We tried using two priors, one a fixed shape parameter and the other a uniform distribution. The results did not vary much when we changed the priors. MCMC runs n chains out of which n - chains are heated. A heated chain has the steady-state distribution with where T is the temperature, i is the number of the chains, π is the posterior distribution and β is the power to which the posterior probability of each heated chain is raised. The chains are heated in an incremental fashion and after each iteration, the states of two randomly picked chains i and j are swapped with the following probability Inferences or sampling is usually done on the cold chain with β = and T = . and the number of chains n = . We ran two independent analyses and the chains were kept running until the average deviation of the split frequencies between the two analyses was less than .. The first % of the analyses were thrown out as part of the burn-in. Discussion We compared the trees resulting from our experiments with the traditional tree topology given by Krishnamurti. First we discuss the trees obtained using distance-based methods which take the number of shared cognates as input. To our surprise, UPGMA which is less sophisticated outputted the tree that best matches the data in table . In his paper, Krishnamurti briefly cites the issues that were pointed out to him in the Fig . The tree makes predictions out of which are correct and are wrong. The wrong predictions are ) Kuvi should be closer to Koṇḍa than it is to Gondi. This prediction is wrong as Kuvi shares innovative items with Koṇḍa but with Gondi ) Koṇḍa should be closer to Manḍa than it is to Gondi. However, Koṇḍa shares only items with Manḍa but as many as items with Gondi which makes this prediction wrong. 
) Manḍa should be closer to Koṇḍa than it is to Gondi. This last prediction also turns out to be wrong since Manḍa shares items with Gondi but only items with Gondi. All these wrong predictions are absent in the tree given by UPGMA as Gondi and Koṇḍa are placed under the same subtree. The other correct predictions were not listed by Krishnamurti and hence could not verified in the UPGMA tree. However, as the UPGMA tree excepting the placement of Gondi and Koṇḍa is identical to the tree given by Krishnamurti, there is good reason to believe that the UPGMA tree is also able to succesfully make these correct predictions. Interestingly, the other distance-based method which is the neighbour joining method infers a tree which is identical to the one obtained by Krishnamurti using two sound changes . Neighbour joining method returned an unrooted tree which we rooted treating Gondi as the outgroup. The results obtained in the next set of experiments using character-based methods on phonological character representation of unchanged cognates yielded trees which are largely in conformity with the standard tree in Fig . We employed three different kinds of parsimony and each of them outputted similar trees. Wagner’s and Dollo’s parsimonies returned the two most parsimonious trees whereas Camin-Sokal’s parsimony returned only one tree. The trees returned by Wagner’s and Dollo’s parsimonies are alike. Each of the parsimonious methods returned a tree which is identical to the standard tree constructed using the comparative method. The tree selected using Krishnamurti’s scoring criterion and the one returned by the Camin-Sokal parsimony are identical. The lowest scoring tree selected using Krishnamurti’s scoring criterion has a score of and so do all the trees returned by these various parsimonies. Both Wagner’s and Dollo’s parsimonies returned an extra tree which had a score of . 
The extra tree returned by Wagner’s and Dollo’s parsimonies is actually ranked as the second best using Krishnamurti’s scoring criterion. This is actually an important result as the relaxation of the irreversibility of sound change constraint gives two trees with the same score . This indicates that as far Please refer to the appendix of Krishnamurti’s paper for the sound changes Wagner’s parsimony-Sound change is equiprobable in both directions as character-based quantitative methods are concerned, the irreversibility hypothesis can be safely dispensed with. In the case of Dollo’s parsimony, the main assumption is that a new trait is very difficult to acquire but very easy to loose. In other words, it is easier for a language to revert to an older sound than to undergo a new sound change. This method also returned an extra tree which is identical to the one ranked second by Krishnamurti. In fact, Krishnamurti’s intuitive tree selection criterion developed in his work is a precursor to the various parsimony methods that have become so popular in different areas of research. Krishnamurti’s assumptions are similar to the assumptions made by Camin-Sokal parsimony. We used Camin-Sokal parsimony to score the tree returned by UPGMA and obtained a score of . Examining the trees returned by the Bayesian analysis, we observed that it essentially returns a tree identical to the one returned by the neighbour joining method but with ternary branching- Gondi, Koṇḍa and the other languages as branches. There is a general ambiguity about the grouping of Manḍa and Pengo as well as that of Kui and Kuvi. All these issues surfaced in Krishnamurti et al too where the explanation put forth in section . was that this discrepancy resulted from the very nature of the sound change C2 . 
Thus, we observed that cases where the trees inferred using the quantitative methods differed from the standard tree constructed using the comaparative method, were in fact, cases of genuine ambiguity even to the historical linguist. The branch lengths returned by all the methods agree upon the fact that Gondi diverged earlier than the other languages and is followed by Koṇḍa.",
         "Geography, Mathematics, Mathematics, Computer Science",
         "c2ef17bf4344a62e6a83efd98dc2c0b160efd3e5",
         "8.126410835214447",
         "No",
         "quantitative methods used accurately predict family relationships among languages historical linguistics?",
         "NLP",
         "Quantitative methods for Phylogenetic Inference in Historical Linguistics: An experimental case study of South Central Dravidian",
         "arXiv.org",
         "2014",
         "3",
         "Workshop/Other",
         "443",
         "6724.0",
         "cs.cl cs.ai"
        ],
        [
         "2",
         "In this article, we investigate the properties of phoneme N-grams across half of the world's languages. We investigate if the sizes of three different N-gram distributions of the world's language families obey a power law. Further, the N-gram distributions of language families parallel the sizes of the families, which seem to obey a power law distribution. The correlation between N-gram distributions and language family sizes improves with increasing values of N. We applied statistical tests, originally given by physicists, to test the hypothesis of power law fit to twelve different datasets. The study also raises some new questions about the use of N-gram distributions in linguistic research, which we answer by running a statistical test.",
         "1759583, 143739029",
         "0",
         "Conclusion In this paper, we tested if the language units of the three classiﬁcations obey a power law. We ﬁnd that the three datasets are not well modeled by a power law model. We then tested if the N -gram proﬁles follow a power law and observed that they actually follow a power-law with cutoff distribution. Finally, we posed two questions about the utility of N -grams for historical linguistics and found that N -grams do not pass the test. References Alstott, J. . powerlaw Python package. http: // pypi.python.org / pypi / powerlaw. Baayen, H. . A stochastic process for word frequency distributions. In Proceedings of the 29th annual meeting on Association for Computational Linguistics , pages –. Association for Computational Linguistics. Baayen, R. H. . Word frequency distributions . Kluwer Academic Publishers, Dordrecht. Campbell, L. and Poser, W. J. . Language classiﬁcation: History and method . Cambridge University Press, Cambridge. Clauset, A., Shalizi, C., and Newman, M. . Power-law distributions in empirical data. SIAM review , :–. Cysouw, M. . On the probability distribution of typological frequencies. In Proceedings of the 10th and 11th Biennial conference on The mathematics of language , MOL’ / , pages –, Berlin, Heidelberg. Springer-Verlag. Dryer, M. S. . Counting genera vs. counting languages. Linguistic Typology , :–. Dunning, T. . Accurate methods for the statistics of surprise and coincidence. Computa- tional Linguistics , :–. Evert, S. and Baroni, M. . zipfr: Word frequency distributions in r. In Proceedings of the 45th Annual Meeting of the ACL on Interactive Poster and Demonstration Sessions , pages –. Association for Computational Linguistics. Hammarström, H. . A full-scale test of the language farming dispersal hypothesis. Diachronica , :–. Haspelmath, M., Dryer, M. S., Gil, D., and Comrie, B. . WALS online . Munich: Max Planck Digital Library. http: // wals.info. Jäger, G. . 
Power laws and other heavy-tailed distributions in linguistic typology. Advances in Complex Systems , . Lewis, P. M., editor . Ethnologue: Languages of the World . SIL International, Dallas, TX, USA, Sixteenth edition. Maslova, E. . Meta-typological distributions. STUF-Language Typology and Universals , :–. Swadesh, M. . Towards greater accuracy in lexicostatistic dating. International Journal of American Linguistics , :–. Wichmann, S. . On the power-law distribution of language family sizes. Journal of Linguistics , :–. Wichmann, S., Müller, A., Velupillai, V., Wett, A., Brown, C. H., Molochieva, Z., Sauppe, S., Holman, E. W., Brown, P., Bishoffberger, J., Bakker, D., List, J.-M., Egorov, D., Belyaev, O., Urban, M., Mailhammer, R., Geyer, H., Beck, D., Korovina, E., Epps, P., Valenzuela, P., Grant, A., and Hammarström, H. . The ASJP database version . http: // email.eva.mpg.de / wich- mann / listss14.zip. Zipf, G. K. . The psycho-biology of language . Houghton Mifﬂin, Boston.",
         "Introduction and related work . Power laws Many real-life phenomena such as word-type frequencies in a corpus, degrees of nodes in a network representation of the internet, the number of species in a genus of mammals and populations of cities follow a power law distribution. Power law distributions seem ubiquitous in nature and many other phenomena are also claimed to obey a power law . Computational linguists will typically have come across power laws in a form popularly known as Zipf’s law . Zipf’s law is stated as f ∝ r − , where f and r is the frequency and rank of a word type x . This is a special case of the power law with the probability density function p deﬁned as p = C x − α where α = . α is the scaling parameter and C is the normalizing constant. If p is lower-bounded at x min then the power law assumes the form of p = x α − min · x − α . Power law is just one member of a larger class of distributions called large number of rare events distributions . As pointed by Evert and Baroni , LNRE distributions have applications in NLP / CL. LNRE distributions can be used to predict the total vocabulary size from a smaller sample. We now turn to a discussion of some recent work in computational historical linguistics where power laws play a central role in the argumentation. There are about languages in the world , forming more than families according to the Ethnologue , whereas more than are listed by Hammarström . A language family is a group of related languages descended from a common ancestor . Each of these language families is assigned a tree structure in at least two classiﬁcations . The size of a language family is deﬁned as the number of related languages included in the family. Wichmann observes that the frequency-rank plot of the sizes of language families seems to follow a power law. Figure 1a is plotted on a log-log scale and shows a slight deviation from the regression line in the region of and ≥ . 
Figure 1b shows the frequency-rank plot for Hammarström’s classiﬁcation. There is a slight deviation from the strict adherence to the straight line in Figure 1b. However, the goodness-of-ﬁt r is in the range of . and . in both the classiﬁcations. Looking into the closely related ﬁeld of linguistic typology, Maslova proposes that meta-typological distributions obey power law. A meta-typological distribution is deﬁned as the number of languages having a particular linguistic feature value, such as a particular word order or a phoneme inventory of a particular size . In response, Cysouw proposes that the distribution is actually exponential masquerading as a power law. . Testing power laws The scaling parameter α sp , α – estimated using a spreadsheet package – in Figures 1a and 1b is . and . respectively. Apart from the high r value, there seems to be no independent C is calculated by solving for R ∞ x min C x − α = for ∀ x ∈ R . In this paper, x takes up integer values only. Hammarström uses cardinal size to indicate family size. In this context, frequency denotes the family size. The data for these experiments is derived from the World Atlas of Language Structures . The data is generated by random selection of a linguistic feature value and counting the number of languages for that value. Wichmann . Hammarström . Figure : Frequency-rank plots for two different classiﬁcations along with the r and the regression-lines generated using a commonly available spreadsheet package. statistical test for the support of a power law. However, a recent paper by Clauset et al. revisited this topic and proposed a number of statistical tests for validating power law models. The authors provide a maximum likelihood estimate of the two parameters, x min and α and a method for computing the statistical signiﬁcance score of the estimates. Further, they test the superiority of the power law with respect to candidate distributions, listed in Table . 
In a recent paper, Jäger applied the statistical tests of Clauset et al. to test the ﬁt of the power law model to global linguistic datasets such as frequency of color terms, phonological templates for selected basic vocabulary items, and meta-typological distributions. Jäger also applied a series of statistical tests to Maslova’s data and showed that a power law with exponential cutoff describes the data better than a power-law model. name probability density function p # of parame- ters power law x α − min x − α power law with exponential cutoff λ − α Γ x − α e − λ x log-normal Æ πσ h erfc \u0010 ln x min − µ p σ \u0011i − x exp h − σ i exponential λ e λ x min e − λ x stretched exponen- tial βλ e λ x β min x β − e − λ x β gamma Γ θ k x k − e − x /θ Table : List of various candidate distributions and the number of parameters in each model. These distributions are popularly referred to as “heavy-tail” distributions. The standard method for testing a power law hypothesis consists of plotting a frequency-rank plot on a log-log scale and applying a linear regression. The linear regression boils down to determining the parameters of log p = c + α log . However, Clauset et al. warn against this. Further, they demonstrate that the value of estimated α differs largely from that derived from the regression analysis. The validity of the power law is tested through the following steps: • Estimate α sp and r using a spreadsheet package by plotting the frequency-rank plot of the data on a log-log scale. • Estimate α est and x min based on the maximum likelihood criterion . • The preference of a power law to rest of the candidate distributions is tested through a likelihood ratio test under a signiﬁcance criterion of p ≤ .. • The absolute goodness-of-ﬁt of a model is computed using the Akaike Information criterion which is deﬁned as in , m is the number of parameters and L is the goodness-of-ﬁt. The model with lowest AIC is the best ﬁt. 
AIC = m − 2log Jäger simpliﬁes the computation of α for discrete data by assuming a continuous approximation of the power-law model and ﬁxing x min at . . This assumption implies that all data points in a dataset completely ﬁt a power-law model. However, it can always be the case that only a part of the dataset follows a power-law model. As a necessary diversion, it is useful to know the computation of x min . The scaling parameter α is estimated by successive removal of the lowest value of x . The ﬁtted distribution is then compared to the empirical distribution through a Kolmogorov-Smirnov statistic . The value of x which minimizes D is chosen as x min . In this paper, we ﬁnd that the rank plot of the size of phoneme N -grams for language families seems to obey a power law distribution as given in Figure 2a. This ﬁnding is in parallel to that of Wichmann . By applying the statistical procedure mentioned above, we attempt to establish whether the family sizes given in three different language classiﬁcations actually obey a power law. Subsequently we test if the phoneme N -grams also obey a power law model. We describe the database in the next section. Database In this section, we describe the global linguistic database . A consortium of international scholars known as ASJP have collected reduced word lists – items from the original item Swadesh word lists , selected for maximal diachronic stability – for more than half of the world’s languages and embarked on an ambitious program for investigating automated classiﬁcation of the world’s languages. The ASJP database in many cases includes more than one word list for different varieties of a language . A word list is included into the database if it has attestations of at least of the items . Only language families with at least members are included in our experiments. This leaves a dataset with language families representing languages of the world. The names of the language families are as deﬁned in the Ethnologue . 
A word list might include known borrowings marked as such and these are not used in our experiments. The words in the ASJP database are transcribed using a reduced phonetic transcription known as ASJP code consisting of consonants, vowels and a symbol for nasalization, and two other ‘modiﬁers’, which are used to indicate that preceding symbols combine as single segments. All click sounds are reduced to a single click symbol and distinctions such as tones, vowel length, and stress are ignored. The computation of a phoneme N -gram proﬁle for a language family is described in section . The frequency-rank plot of the language families in the current sample is shown in Figure 2b. The regression shows a r value of . which is quite high. World map Frequency-rank plots Figure : ASJP word lists on world map and the plots with r values for families from Ethno- logue. Experiments and results All the word lists belonging to a single language family are merged together. Recall that the ASJP database can include more than one word list – representing different varieties – for a language. All the consecutive symbol sequences of length varying from – are extracted and the size of the N -gram proﬁle is deﬁned as the total number of unique – N -grams obtained through this procedure. Thus, a -gram proﬁle consists of all the phoneme -, - and -grams. The size of the -, - and -gram proﬁles for each of the language families as deﬁned in the Ethnologue is given in Table . In effect, an N -gram proﬁle is the sum of all the n -gram types leading up to N . As evident from right panel of Figure , each of the N -gram proﬁles, N ≥ , seem to follow a power law. When a power law regression is applied to each of the frequency-rank plots, the goodness-of-ﬁt r is . , . and . for -grams, -grams and -grams respectively. The r value of both -grams and -grams is quite low, only . and . when compared to the r value of the number of languages, . . 
We also plot the frequency-rank plots for each n -gram type in the left panel of Figure . The r values are quite high and are . , . and . respectively. The r scores in Figure 1a for – grams are very high and fall within the range of the correlation of . , reported by Wichmann . As we have shown above, the correlation of N -gram distribution to language family size improves with increasing N . This is a kind of behavior familiar from corpus studies of word distributions , where closed-class items – typically function words – yield distributions similar to the -grams in this study, whereas open-class words display typical power-law behavior for all corpus sizes, just like the –-grams in this study. We take this as an indication that we are on the right track, investigating a genuine linguistic phenomenon. We also test if the genus size across the world’s languages displays a power-law like behavior. A genus is a language classiﬁcation unit which contains related languages and is estimated to be – years old. The genus level was originally introduced by Dryer . We use the genus information given in ASJP database. The current dataset has genera and word lists. Table shows the results of the application of the Figure : N-gram proﬁles ﬁtted to a power law. Language family NOL -gram -gram -gram Language family NOL -gram -gram -gram Austronesian Macro-Ge Niger-Congo Sepik Trans-NewGuinea Tai-Kadai Afro-Asiatic Chibchan Australian WestPapuan Indo-European EasternTrans-Fly Nilo-Saharan Dravidian Sino-Tibetan LakesPlain Arawakan Border Austro-Asiatic South-CentralPapuan Oto-Manguean Penutian Uto-Aztecan Panoan Altaic Witotoan Salishan Hokan Algic Quechuan Tupi Siouan Torricelli Na-Dene Mayan Hmong-Mien Tucanoan Totonacan Ramu-LowerSepik Khoisan Carib Sko NorthCaucasian Mixe-Zoque Uralic Table : The number of languages , -gram, -gram and -gram proﬁles for language families. statistical tests to different datasets. 
Judging by AIC, none of the classiﬁcation unit datasets follow a power-law distribution. It is important to notice that the α est widely differs from α sp . As demonstrated by Clauset et al. , there can be a large difference when estimating α for small datasets of size ≤ . Only the − -gram proﬁles follow a power-law with cutoff model ascertained by the lowest AIC value. Interestingly, n tail values are highest for n = followed by and . The value of α for a power law is typically between and . The values of α est for N -gram proﬁle also lie between and . The AIC values in Table suggest that the power-law with cutoff is a better model than power- law for for N -gram proﬁles. We assess this superiority through a likelihood ratio test. The results Data x min ln n tail α est PL α _ sp PLWC LN exp str exp Hammarström -. . . . . . . . . ASJP -. . . . . . . . . WALS genus -. . . . . . . . . -grams -. . . . . . . . . -grams -. . . . . . . . . -grams -. . . . . . . . . -grams -. . . . . . . . . -grams -. . . . . . . . . -grams† -. . . . . . . . . -grams † -. . . . . . . . . -grams† -. . . . . . . . . -grams † -. . . . . . . . . Table : The ﬁrst three rows correspond to the language size data of Hammarström , ASJP data and genera sizes. Columns – correspond to the estimated parameters in a power law. Column shows the AIC value for a power law. Column shows the α sp estimated by a standard spreadsheet package. The remaining columns correspond to the AIC values for the other candidate distributions. The last four rows show the ﬁt of each n -gram proﬁle leading upto and are indicated by a † . For each dataset, the best ﬁt model is indicated in bold . Here, the common factor is not included in the AIC computation. All the above results are computed using the power-law python package . Data PLWC LN exp str exp Hammarström − . . . − . . ASJP − . − . . − . − . genus − . − . . − . − . -grams − . − . . − . − . -grams − . . . − . − . -grams − . . . − . . -grams − . − . . − . − . 
-grams − . − . . − . − . -grams† − . . . − . . -grams† − . . . − . . -grams† − . . . − . . -grams† − . . . − . . Table : The table shows the results of the likelihood-ratio test for comparing the power law with the rest of candidate distributions. The − sign indicates the test favoring the candidate model than the power law model. Each number is the p -value and the signiﬁcance is indicated in bold . are given in Table . The results suggest that the PLWC is a better ﬁt than PL at a signiﬁcance criterion p ≤ . . Interestingly, none of the family-size datasets are genuinely power-lawish. They seem to belong to other “heavy-tailed” distributions. Incidentally, Hammarström’s dataset – covering more than languages – ﬁts better to a stretched exponential model than a power-law distribution. Even though this study shows that phoneme N -gram proﬁles closely mirror the power-law-with- cutoff behavior, it raises more questions than it answers about the use of N -gram distributions in linguistic research, such as: Q. Is the N -gram distribution an effect strictly connected with genetic relatedness among the languages, or simply an effect of the number of languages in a group ? A. We answer this question through the following procedure: . For a family size s , make a random sample of languages of size s . . Compute the N -gram proﬁles. Repeat steps − for all family sizes and plot the N -gram proﬁle sizes. The results are shown in Figure 4a. All the r values are in the range of . to . . This experiment suggests that the N -gram distribution is related to genetic relatedness and not an effect of a sample size. Q. If the effect is genetic, can the size of the family be predicted from N -gram proﬁles of smaller samples than the full family? A. We answer this question through the following procedure: . For a family of size s , create a random language sample of size i , where ≤ i ≤ s . . Compute the N -gram proﬁles for each random sample. . 
Repeat steps − for iterations and compute the average size of a N -gram proﬁle for each i . Repeat the steps − for all i . The results of this experiment for s = is shown in Figure 4b. Figure 4b shows the plot for the average number of N -gram types vs. the size of language sample. All N -gram curves seems to be increasing monotonically and not stabilizing after a particular sample size. Only -grams and -grams tend to stabilize with respect to sample size. The N -gram curves for other language families also follow the same trend. These results suggest that the N -grams of smaller samples cannot be used to predict the full family size. Random samples. Average N -gram proﬁle size",
         "Geography, Computer Science, Political Science, Philosophy",
         "b1cee2480e3ce3fefe40fd29640fa14d554259bf",
         "1.6835016835016834",
         "No",
         "phoneme n-grams across different language families exhibit power law distributions correlate language family sizes?",
         "NLP",
         "Properties of phoneme N -grams across the world's language families",
         "arXiv.org",
         "2014",
         "2",
         "Workshop/Other",
         "297",
         "4097.0",
         "cs.cl stat.co"
        ],
        [
         "3",
         "Spoken Language Systems at Saarland University (LSV) participated this year with 5 runs at the TAC KBP English slot filling track. Effective algorithms for all parts of the pipeline, from document retrieval to relation prediction and response post-processing, are bundled in a modular end-to-end relation extraction system called RelationFactory. The main run solely focuses on shallow techniques and achieved significant improvements over LSV's last year's system, while using the same training data and patterns. Improvements mainly have been obtained by a feature representation focusing on surface skip n-grams and improved scoring for extracted distant supervision patterns. Important factors for effective extraction are the training and tuning scheme for distant supervision classifiers, and the query expansion by a translation model based on Wikipedia links. In the TAC KBP 2013 English Slotfilling evaluation, the submitted main run of the LSV RelationFactory system achieved the top-ranked F1-score of 37.3%.",
         "143862204, 33477621, 144036063, 40157771, 2561225",
         "59",
         "Conclusion The LSV English slot ﬁlling system Re- lationFactory is a distant supervision system for query-based relation extraction. It is based on query expansion by anchor text translations, hand-crafted seed patterns and two distant su- pervision components: one modeling relation prediction as a classiﬁcation task using Sup- port Vector Machines and shallow features; the other scoring surface patterns by a combination of generative and discriminative distant supervi- sion noise reduction models. A detailed analysis showed that each of the aforementioned com- ponents contributed to the overall good perfor- mance of the system. Other components that were not included in the main run, such as hand- written dependency patterns or validation by answers found in a Wikipedia text corpus, could not improve on the results achieved with these basic components. Acknowledgements Benjamin Roth is a recipient of the Google Eu- rope Fellowship in Natural Language Process- ing, and this research is supported in part by this Google Fellowship. Tassilo Barth was supported in part by IARPA contract number W911NF- -C- and Michael Wiegand by the Ger- man Federal Ministry of Education and Re- search under grant no. “01IC10S01”. References Razvan C Bunescu and Raymond Mooney. Learning to extract relations from the web us- ing minimal supervision. In Annual meeting- association for Computational Linguistics , volume , page , . Yee Seng Chan and Dan Roth. Exploiting syntactico-semantic structures for relation ex- traction. In ACL , pages –, . Grzegorz ChrupaBla and Dietrich Klakow. A Named Entity Labeler for German: exploiting Wikipedia and distributional clusters. In Pro- ceedings of the Conference on International Language Resources and Evaluation , pages –, . Thorsten Joachims. Making large scale svm learning practical. . Yan Li, Sijia Chen, Zhihua Zhou, Jie Yin, Hao Luo, Liyin Hong, Weiran Xu, Guang Chen, and Jun Guo. Pris at tac2012 kbp track. 
In Proceedings of the Text Analysis Conference , . Ryan T McDonald and Joakim Nivre. Charac- terizing the errors of data-driven dependency parsing models. In EMNLP-CoNLL , pages –, . Bonan Min, Xiang Li, Ralph Grishman, and Ang Sun. New york university system for kbp slot ﬁlling. In Proceedings of the Text Analysis Conference , . Soumya Ray and Mark Craven. Supervised ver- sus multiple instance learning: An empirical comparison. In Proceedings of the 22nd in- ternational conference on Machine learning , pages –. ACM, . Benjamin Roth and Dietrich Klakow. Cross- language retrieval using link-based language models. In Proceeding of the 33rd interna- tional ACM SIGIR conference on Research and development in information retrieval , pages –. ACM, . Benjamin Roth and Dietrich Klakow. Com- bining generative and discriminative model scores for distant supervision. In Proceedings of the Conference on Empirical Methods in Natural Language Processing . As- sociation for Computational Linguistics, . Benjamin Roth, Grzegorz Chrupala, Michael Wiegand, Mittul Singh, and Dietrich Klakow. Generalizing from freebase and patterns using distant supervision for slot ﬁlling. In Proceed- ings of the Text Analysis Conference , . Benjamin Roth, Tassilo Barth, Michael Wie- gand, and Dietrich Klakow. A survey of noise reduction methods for distant supervision. In Proceedings of the workshop on Auto- mated knowledge base construction , AKBC ’, pages –. ACM, .",
         "Introduction The English slot ﬁlling task of TAC KBP re- quires participants to extract relational informa- tion about query entities of type person or orga- nization from a large text corpus. At the center of the TAC KBP slot ﬁlling task lies the relation detection task, however, steps like document retrieval, ﬁnding and disambiguating potential query or answer matches can also have a signif- icant impact on performance. Since TAC KBP slot ﬁlling is formulated by stating a well-deﬁned information need, it is designed to shed light into which approaches and steps in a pipeline are most beneﬁcial to solving a query-driven re- lational extraction task. The Spoken Language Systems at Saarland University slot-ﬁlling system Rela- tionFactory is based on the architecture of the LSV slot ﬁlling system . RelationFactory is based on the driving princi- ples of developing a modular and easily extensi- ble distant supervision relation extractor, mak- ing use of shallow textual representations and features. In this paper, we give an overview of the general architecture of the system, and de- scribe novel components and additional evalu- ations. We will only brieﬂy sketch the compo- nents already described in Roth et al. , to which the interested reader is referred. The paper is structured as follows: Section describes the general system design and dis- cusses each single component in turn. Section discusses the results obtained and provides a detailed per-component evaluation. We give a brief discussion and conclusion in Sections and . Pipeline and Component Descriptions The workﬂow and architecture of RelationFac- tory as illustrated in Figure is based on our last year’s system . 
Beyond the changes that we implemented to account for the new requirements in the task deﬁni- tions , we improved the perfor- mance of the distant supervision SVM classi- ﬁer, included a distant supervision-based pat- tern module in the main run, and included experimental modules in additional runs. Our pipeline is a two-stage pipeline with a can- didate generation stage and a candidate validation stage . The candidate validation stage consists of several modules that decide whether a candidate indeed ex- presses the relation or not. We used the same training data and hand- crafted surface patterns as in Roth et al. , all improvements with respect to last years’ sys- tem are due to advances on the modeling and al- gorithmic side. The main improvements, which are included in the main run, stem from an improved and consolidated feature representa- tion and a recently developed distant supervi- sion pattern scoring scheme. . Query Expansion, Retrieval and Candidate Matching The name of a TAC KBP query entity that is provided is expanded by a translation model based on Wikipedia anchor text, inspired by work on cross-language information retrieval . The advantage of us- ing anchor text rather than e.g. Wikipedia redi- rects is that anchor text captures a wide range of variations, as they occur in actual sentences. In order to avoid translations to surface forms that mainly denote other entities, we only retain query expansions for which the most frequently co-occurring Wikipedia page is the same as for the original query name . Example output for this type of query expansion is displayed in Figure . For queries of type organization , additional expansions are generated by augmenting the original name by common suﬃxes of business forms . For queries of type person , additional ex- pansions are obtained by adding the last name of a person only . 
We use the expan- sions for retrieval and matching directly and do not use any other entity linking or disambigua- tion strategies as entity linking is not the focus of this work. We retrieve documents by using the original query name and a query expansion, selected by high pointwise mutual information w.r.t. the query. Sentences are tagged using a state- of-the sequence perceptron named-entity tag- ger ; non-standard named-entity types relevant for certain relations are matched using lists of respective Freebase types. Sentences where a query or one of its ex- pansions matches, and that have a named-entity tag of a potential slot ﬁller type, are passed to the prediction components for validation. More details on query name expansion, retrieval and candidate matching can be found in last years’ system description . . Distant Supervision SVM Classiﬁers The most important candidate validation com- ponent , both in terms of stand- alone F1-score, as well as F1 contribution in the ablation analysis, is the set of distantly super- vised per-relation SVM classiﬁers. Original query Wikipedia link anchor text expansions Per: last name / Org: suﬃxes Ali Akbar Khan Utd. Ali Akbar Khan, Ustad Ali Akbar Khan Khan Adam Gadahn Azzam the American, Adam Yahiye Gadahn Gadahn Nancy Kissel Murder of Robert Kissel, Robert Kissel Kissel DCNS Direction des Constructions Navales, DCN, ... DCNS Ltd, DCNS Corp, ... STX Finland Kvaerner Masa Yards, Aker Finnyards, ... STX Finland Ltd, ... Figure : Examples of query expansions. The expansion of Nancy Kissel is an example of a wrong expansion to thematically related entities. The vast majority of query expansions is, however, beneﬁcial. .. Training Data The training data is the same as used in Roth et al. . Distant supervision argument pairs are obtained by mapping Freebase relations to TAC relations and by matching hand-crafted seed patterns against the TAC text collec- tions. This way we obtain two sets of seed pairs. 
We use a maximum of k argument pairs per relation for each of the two sets of seed pairs. These pairs are then matched against the TAC text corpora, and a maximum of sen- tences per pair are used as training data. .. Feature Set In comparison to last years’ system we use a rather minimalistic feature set. We do not in- clude most of the previously used features but only model context with token n-gram-based features. When us- ing token n-grams, we found it essential to mark whether the query or the slot ﬁller comes ﬁrst. Additionally, in- cluding sparse n-grams, where tokens in the middle of the n-gram were wildcarded, in- creased performance. For the context between ARG1 and ARG2 , we use n-grams up to length and skip-n-grams of length and . We model the left and right context outside the arguments with n-grams up to length . Figure shows examples of extracted features for a candidate instance. .. Aggregate Training and Parameter Tuning We train one binary support vector machine for each of the relations using the distant su- pervision matches for that relation as positive data and the matching contexts for all other re- lations as negative data. We use SVM light as the classiﬁcation toolkit. Two eﬀective mech- anisms to increase distant supervision training performance are employed: We refer to the ﬁrst as aggregate training , the second is global param- eter tuning . Aggregate training. Distant supervision training data contains noisy false positive sen- tence matches of pairs in the knowledge base. Min et al. approach this problem by a method called label-reﬁnement: the matching sentences are taken as instances and a classiﬁer model is trained on them. In a second iteration, the training data is classiﬁed by this model, and the ﬁnal model is trained using the resulting reﬁned labels. This relabeling should reduce ambiguity in the training data, and en- force common patterns for a relation found in the ﬁrst iteration. 
In our experiments, we found a another ex- tremely simple method, which we call aggre- gate training , to be more eﬀective. Here, in- stead of treating each distant supervision match as a single training example, we If the same feature vector happens to occur more than once in the training data, and is labeled both as positive and negative, those instances of the feature vec- tor which are labeled as negative are removed from the training data. / , Joachims Relation: per:origin Candidate sentence: One Pakistani intelligence oﬃcial said he is Adam Gadahn, a California native and the ﬁrst U.S. citizen to be charged with treason in years. Feature examples: BETWEEN NGRAM#ARG1 > #, > OUTSIDE NGRAM#ARG2 > #citizen > #to > SKIP NGRAM#native > ###ﬁrst > Figure : Examples of extracted features. Each feature is ﬁrst marked with the feature group it belongs to , followed by the token sequence of the n-gram, using # as a separator. Each token is marked to indicate whether the slot ﬁller comes left or right of the query. group all sentences per entity pair, extract the features, sum the feature counts of all these sen- tences and normalize the feature vector for that pair so that the highest feature has weight . . This scheme greatly increases training speed. Moreover, it mitigates a potential skew in the training data by features that are highly cor- related with a distant supervision pair , but not with the respective re- lation , which might lead the clas- siﬁer to model frequently matching argument pairs rather than generalizing to the relation. More investigations into the eﬀectiveness of this training scheme are left for future work. Global parameter tuning. Tuning the cost-factor by which training errors on posi- tive examples outweigh errors on negative ex- amples has often been observed as crucial to perfor- mance. 
Moreover, experimental results suggest that simple misclassiﬁcation cost tuning is supe- rior to multi-instance learning in many settings including relation ex- traction . We therefore trained three SVM conﬁgura- tions for each relation by setting the j -parameter to . , . and . , respectively. We found that the best local parameter choice does not necessarily correspond to an op- timal global F1-score: For ex- ample, for relations with a low precision over the whole recall range , increasing the individual F1-score by increasing recall may have a nega- tive overall eﬀect. Likewise, for relations with an above average precision, it may be beneﬁcial for overall performance to score more instances as positive than tuning for individual F1-score may result in. To avoid these problems that arise by individ- ually maximizing per-relation F1-scores, we use a greedy procedure to tune the per-relation j - parameters in order to optimize global F1-score, instead. Algorithm shows the pseudo-code of the global parameter-tuning. We use R to de- note the set of relations, j a choice of param- eter for a particular relation r ∈R , evaluate a function returning the global F1-score for the current choices of j , and evaluate\\j) the global F1-score with a particular j replaced by j . The parameters are tuned with respect to performance on earlier TAC KBP slot ﬁlling queries . .. Prediction While training is done on an aggregate level, prediction is done on each candidate sentence independently. The per-sentence prediction is motivated as in TAC KBP, the task is not to ﬁnd pairs that likely belong into the knowledge base , but to ﬁnd pairs that justiﬁably belong into the knowledge base . An answer is returned if at least one Algorithm Global parameter tuning. The second loop over the relations can be executed iteratively . for r ∈R do j ← . f ←evaluate for r ∈R do for j ∈{ . , . , . 
} do ˆ f ←evaluate \\j ) if ˆ f > f then f ← ˆ f j ← j candidate sentence with it is classiﬁed as true . . Distant Supervision Patterns As a second distant supervision component be- sides the SVM classiﬁers, we include simple in- tertext patterns that are scored according to frequency in the distant supervision data, and two combined noise reduction methods to sup- press the inﬂuence of false positive matches . The pattern scoring follows the method of Roth and Klakow and combines for each pattern the count of the respective relational topic on the training data n ) and the score of a discriminatively trained Percep- tron model P . The overall scoring func- tion further includes the relative frequency of a pattern for a speciﬁc relation n n and the perceptron probability P of the pat- tern to express no relation. It is denoted by: . · n ) n + . · n · P n · + P ) The scoring function provides scores in the in- terval between . and . . We use the same global parameter tuning method as for the dis- tant supervision SVM classiﬁers to ﬁnd score thresholds on the intertext patterns . We tune thresholds on the score levels . , . , . , . and . . . Inﬂuence of Hand-Crafted Patterns In TAC KBP, the task is deﬁned by a human readable task description, mostly independent of restrictions on the kind of methods to be used. The mapping of that task description into an automatic system always requires human eﬀort. The most popular approaches to capture that human translation step is by providing hand- crafted patterns or manually establishing map- pings to knowledge-bases such as Wikipedia in- foboxes or Freebase. In order to keep the eﬀort to pattern writ- ing minimal, in our system we restricted the patterns to plain sequences of tokens with a general placeholder and did not use syntactic patterns that would require linguistic expertise. We use the same patterns as in our system . 
Although we found it generally to be less eﬀort to write down a few token sequences than to identify the cor- responding relational correspondence in Free- base , it is interesting to quantify the inﬂuence of the hand-crafted patterns in our sys- tem. We therefore compare the performance of our hand-written patterns to the reported scores of hand-written patterns in the NYU system . In the NYU system there are three modules with dedicated hand- crafted patterns: A so-called local patterns mod- ule, that includes short patterns similar to ours, and two bootstrapped patterns modules that take additional dedicated hand-crafted patterns as an input and iteratively add new patterns, based on corpus co-occurrences. The NYU pattern bootstrapping modules use hand-crafted pat- terns both based on token sequence and syn- tactic paths. Table shows the performance of the NYU hand-written pattern modules and our hand- written pattern module for the TAC task. System / Pattern Component Precision Recall F1-Score NYU / local patterns . . . NYU / bootstrapped linear . . . NYU / bootstrapped dependency . . . LSV / token sequence . . . Table : Comparison of the NYU hand-crafted pattern modules and the hand-crafted pattern component used in our system , on the task. For the LSV system we give the exact evaluation of the system, and in brackets the anydoc and lowercase evaluation of the currently used system. It should be noted that performance of a partic- ular module is also aﬀected by other factors such as retrieval, argument matching and postpro- cessing. The performance of the hand-written patterns in our system roughly corresponds to that of the NYU hand-written local patterns component. . Alternate Names Entity pairs of the relation alternate names can be predicted by any of the validation components such as the SVM classiﬁer or a pattern matcher. 
Additionally, we in- clude a dedicated component that explicitly returns a slot ﬁller for per:alternate names or org:alternate names if an expression re- turned by our query expansion matches in one of the retrieved documents. . Postprocessing and Redundancy Removal Postprocessing and redundancy removal are based on mapping the answers to normal forms based on Wikipedia link translations and lower- casing as in the system . Additionally, due to the changed task descrip- tion for the per:title relation, we included job titles multiple times if they co-occurred with diﬀerent organization names, and the co- anydoc is an option in the TAC scorer for scoring in- dependently of the reported document id, lowercase is an option for case insensitive scoring. When running the current systems the relations per:employee of and per:member of are merged both for prediction and eval- uation. occurrence was licensed by a pattern. However, the special treatment of per:title decreased overall performance. We do not use any cut-oﬀ on the number of returned answers per slot. . Modules not Included in the Main Run PRIS Syntactic Patterns. We implemented a module to match the dependency patterns provided by the PRIS team . Thus we wanted to test whether dependency patterns may help to improve performance in our pipeline. Due to the many degrees of free- dom to incorporate those patterns into a relation extraction system, we cannot guarantee that our module makes the best use of the provided patterns. Wikipedia-Based Validator. This module runs the relation extraction pipeline on an ad- ditional Wikipedia text dump and uses the slot ﬁllers obtained this way to validate candidates retrieved from the TAC corpora. Results and Evaluation . Submitted Runs Table gives an overview over the submitted runs. 
They are characterized as follows: • lsv1 : In this run, only fast validation components are used, this means especially no syntactic analysis and no query-speciﬁc analysis of an additional Wikipedia dump. The fast components are The list of patterns was compiled from high- frequency context patterns between entities of type and . run id run type P R F1 lsv1 fast . . . lsv2 precision . . . lsv3 all . . . lsv4 recall . . . lsv5 all shallow . . . Table : Oﬃcial scores on runs submitted by team LSV. the SVM classiﬁer, the distant supervision patterns, the hand-written patterns, and the alternate names expansion module. • lsv2: Only modules are included that pro- duced high precision on the develop- ment data. This includes most components of lsv1 , but not the SVM classiﬁer. Addi- tionally the syntactic patterns are included in this run. • lsv3: This contains all validation compo- nents with standard conﬁguration. It in- cludes all components from lsv1 and lsv2 , and the Wikipedia-based validator. • lsv4: This is a high-recall run. In addition to the components of lsv3 , the entity expan- sion is relaxed , and per:employee or member of slots are inferred from predicted per:title slots . • lsv5: This is a run that exclusively com- prises shallow components . It corresponds to lsv1 to- gether with the Wikipedia-based validator. Interestingly, the fast run , that only ex- tracts surface-level features and matches linear patterns, is the best performing in terms of F score. Increasing the precision by concentrating on high-precision modules as well as increasing recall by merging responses from more modules did not have an overall positive eﬀect. It re- mains for future work to analyze whether addi- tional improvements can be achieved by a more principled module combination scheme . . Single Component Analysis and Ablation Analysis Table shows the performance of the single com- ponents and the merge of their responses. 
In order to show how complementary single com- ponents are with respect to the other compo- nents, Table gives an ablation analysis on the best-performing run . Some observations on the performance of single components: • Alternate Names. The inferred alternate names slot ﬁllers from the query expansion are high-precision. Al- though concerned with only two relations, this component gives a F1 gain of . % on top of the other components. • The hand-crafted patterns provide high- precision responses, but have relatively low recall for a component modeling all rela- tions. They are considerably complemen- tary w.r.t. the other components . • Distsup patterns. The patterns induced from the distant supervision data provide good-precision responses with good recall. They capture information not modeled by either the SVM classiﬁers or the hand- crafted patterns . • PRIS syntactic patterns. The depen- dency patterns have good precision, but are slightly behind plain surface patterns in our experiments. • Distsup SVM classiﬁer. The SVM clas- siﬁers are the strongest relation validation component in our system, both in terms of single performance as well as complemen- tary F1 gain . • Wikipedia-based validator. This is the component with the lowest precision, since apart from candidate generation , as well as the F1 gain contributed by the respective component on top of the other components. Components are sorted by complementary F1 gain. matching, tagging) only overlap with an- swers from Wikipedia is checked. It is inter- esting to note that while this component ob- tained high precision in our internal devel- opment benchmarks, precision was rather low for the oﬃcial submitted run. • Inferred per:title aﬃliations. In- ferring per:employee or member of from predicted per:title relations had a min- imal eﬀect on the precision/recall ratio. • Query expansion and Relaxed query expansion. 
It is important to note that query expansion has a high eﬀect on over- all performance, contributing a F1 gain of . %. This is due to the greatly positive eﬀect on recall, while almost not harming precision. Query expansion plays a role in both document retrieval and query match- ing. It seems necessary not to overgener- ate, as predicting more ambiguous aliases increased recall but had negative eﬀect on F1-score. • Redundancy removal. Removing redun- dant slot ﬁllers using Wikipedia anchor text had a slightly beneﬁcial eﬀect on overall F1. • Multiple per:title s. On the other hand, trying to cluster predicted per:title s by their aﬃliations was detri- mental to performance. Discussion: Shallow vs. Deep Analysis In our main run no deep linguistic analysis, such as dependency parsing, was used. Merely named-entity tagging was used to identify slot In development, the Wikipedia-based validator achieved an ‘anydoc’ precision of % on data. It is to be expected that evaluation independently of the docu- ment id results in higher scores – however, the Wikipedia validator was the only module with such a big mismatch between development and submission scores. The components related to post-processing, redun- dancy removal and multiple per:title s are part of every run in Table and therefore only separately evaluated in the ablation study . ﬁller candidates – all the features and patterns operate directly on the surface level. When de- veloping the RelationFactory KBP system, we kept experimenting with more linguistically mo- tivated representations but found that they did not provide any gain over represen- tations derived directly from the surface forms. While syntactic representations provide a metaphor every researcher in the ﬁeld is accustomed to when speaking about relational representations, our observations suggest that taking one step back from the dependency view may clear the sight to more central aspects of certain information extraction tasks. 
Apart from purely practical advantages of a shallow approach , there are also more considerations: • Contextual cues. Words or word se- quences that do not express the relation but provide topical information and may dis- ambiguate a relational expression are nat- urally included in a shallow feature repre- sentation. A dependency analysis, however, aims to strip oﬀthose cues. • Micro-structures without content words. Chan and Roth observe that in ACE % of the mention pairs in a relation do fall in a pattern type where the relation is not explicitly expressed by a content word. The four pattern types they identify are Premodiﬁer , Possessive , Preposition and Formulaic . • Parsing errors. While syntactic parses may be accurate for short distance depen- dencies, which also can be easily captured by surface patterns, for longer distances the dependency accuracy signiﬁcantly de- creases .",
         "Mathematics, Computer Science, Computer Science, Engineering, Computer Science, Medicine, Computer Science, Political Science, Art",
         "4062a185b0aeb76626b13a59814128dd2a26ddc3",
         "0.0",
         "No",
         "shallow distant supervision methods effective slot filling spoken language systems?",
         "NLP",
         "Effective Slot Filling Based on Shallow Distant Supervision Methods",
         "Text Analysis Conference",
         "2014",
         "5",
         "Workshop/Other",
         "290",
         "5128.0",
         "cs.cl"
        ],
        [
         "4",
         "This paper describes the design and implementation of a Unicode-based GUISL (Graphical User Interface for Sindhi Language). The idea is to provide a software platform to the people of Sindh as well as Sindhi diasporas living across the globe to make use of computing for basic tasks such as editing, composition, formatting, and printing of documents in Sindhi by using GUISL. The implementation of the GUISL has been done in the Java technology to make the system platform independent. The paper describes several design issues of Sindhi GUI in the context of existing software tools and technologies and explains how mapping and concatenation techniques have been employed to achieve the cursive shape of Sindhi script.",
         "1954336, 1848245, 2303781",
         "5",
         "CONCLUSIONS & FUTURE DIRECTIONS This paper has concentrated on the issue of designing and development of a GUISL. Various characteristics and issues affecting the use of Sindhi in computing have been discussed such as changing the orientation of the text from right-to-left, ordering/sorting of Sindhi Unicode characters and proper cursiveness for the ligation of words. After successful implementation of the algorithms FIG. . ON-SCREEN SINDHI SEQUENTIAL KEYBOARD FIG. . ON.SCREEN SINDHI KEYBOARD discussed, the GUI of any application can be defined in Sindhi language. The issues and methods discussed can also be applied on other languages that are written in Perso-Arabic script. This work is targeted at the provision of a generic programming framework for the development of Sindhi language based applications. Based on the present work we are now in the process of the development of a platform independent Sindhi word processor that includes localised supporting tools such as dictionaries, translation/transliteration, spell-checker, calendar and calculators. ACKNOWLEDGEMENTS Authors are indebted to acknowledge the valuable suggestions of the anonymous reviewers which helped in DESIGN & DEVELOPMENT OF THE GRAPHICAL USER INTERFACE FOR SINDHI LANGUAGE MEHRAN UNIVERSITY RESEARCH JOURNAL OF ENGINEERING & TECHNOLOGY, VOLUME , NO. , OCTOBER, further improving the readability of this article. The constructive discussion with officials of SLA , regarding the use of proper technical Sindhi words is also highly appreciated and acknowledged. REFERENCES",
         "INTRODUCTION T oday computers play very important role in the daily life of common people, and it is observed that the use of computers in countries other than developing ones is substantial in every walk of life. It is a fact that the more the developing countries transform their systems to adapt the use of computers the more progress and development they would achieve and hence would reduce the so called gap of ‘digital divide’. The public use of computers, however, is some how dependent on the support for the regional languages of the countries. There are thousands of languages being spoken throughout the world, and it has been noticed that the English speaking countries have an edge over other nations where English is not as their native language in use of computers. There are many countries which have adopted their native language in use of computers and succeeded to cope with computing problems. There are some countries where multiple languages are spoken such as India, Sri Lanka and Pakistan. In Pakistan Urdu, Punjabi, Sindhi, Pashto, Blochi and Siraiki etc. are spoken . Sindhi is the official language of Sindh province of Pakistan and it is also one of the constitutionally recognized languages of India. Sindhi is spoken by an estimated . million people in Pakistan and about . million people in India . During and after the partition of the subcontinent many Sindhi families also moved abroad for permanent settlement in many countries across MEHRAN UNIVERSITY RESEARCH JOURNAL OF ENGINEERING & TECHNOLOGY, VOLUME , NO. , OCTOBER, DESIGN & DEVELOPMENT OF THE GRAPHICAL USER INTERFACE FOR SINDHI LANGUAGE MEHRAN UNIVERSITY RESEARCH JOURNAL OF ENGINEERING & TECHNOLOGY, VOLUME , NO. , OCTOBER, the globe; thus forming a significant community of Sindhi diasporas . Sindhi language is classified as an Indo-Aryan language belonging to Indo-Iranian branch of the Indo-European language family. 
Though the original script of Sindhi language found in the remnants of Mohen-jo-Daro has yet not been interpreted, the preliminary system of writing appeared before 8th century AD . The contributions of Sufi poets like Shah Abdul Latif Bhitai, Sachal Sarmast and Sami further popularized Sindhi language in literary circles in between 14th and 18th century . Since this time, the literature in Sindhi language has grown significantly and currently Sindhi is being used as medium of instruction in Sindhi majority public sector schools of province of Sindh from primary to secondary level . Dozens of daily Sindhi newspapers are being published along with several dedicated Sindhi TV channels. Besides this Sindhi is also one of the two languages used by Government of Pakistan to issue computerized national identity cards to its citizens. Though the proper use of Sindhi language in all of the above mentioned spheres requires its proper adaptation and progress in terms of modern computer based standards and models; however, too little work has yet been done for the standardization of Sindhi computing as well as development of computer based models of Sindhi script, speech and language . The major issues in Sindhi computing as investigated and observed are due to the following reasons: No localized Sindhi Software available. Non-availability of compatible Sindhi fonts for different operating systems with standard Unicode format. Complex process of enabling Sindhi through “Regional and Language Settings”. Non-compatibility of Sindhi keyboard layout for different operating systems. Lack of computer literature and books in Sindhi. No educational programs in Sindhi Computing at School and College levels. Research activities at infancy in Sindhi computing. Communication and coordination gap among researchers and professionals etc. 
As far as we know, so far, neither the provincial government of Sindh nor the federal government of Pakistan has initiated any significant project to fund the research on Sindhi language processing. Not only this but the complete failure of the government to control copy right system and eliminate the use of pirated software has also contributed to the lack of willingness of developers to commit any resources for the research and development of Sindhi language. As a result of this there are only a few legacy tools being used for Sindhi desktop publishing which vary from one publisher/individual to another publisher/individual. Almost all of these tools adapt English-based word processing systems and make use of their specific code pages for Sindhi character set. To the best of our knowledge, the development of a Sindhi-based GUI for the processing of Sindhi language has not yet been reported. This paper takes a step towards this aim and presents the design and development of a GUI for SL . The rest of the paper is organized as follows: Section presents the background and motivation along with a thorough discussion of the major issues in terms of Sindhi computing and also in terms of the design of the GUISL; Section presents the design and development of the GUISL and finally Section concludes the paper with added description of the future work. DESIGN & DEVELOPMENT OF THE GRAPHICAL USER INTERFACE FOR SINDHI LANGUAGE MEHRAN UNIVERSITY RESEARCH JOURNAL OF ENGINEERING & TECHNOLOGY, VOLUME , NO. , OCTOBER, . GUISL: DESIGN ISSUES Historically, Sindhi language has adopted a variety of writing systems based on the innovations and preferences of particular regional communities. Most of the names of the ancient scripts are either based on the names of the cities/towns or the names of the communities . During the colonial period, owing to the Sansakrit related nature of Sindhi, some British scholars advocated for the promotion of Devnagri script to be used for Sindhi. 
To this aim they managed to publish the translation of Bible and Sindhi- English dictionary using Devnagri script in - . This move was opposed by some government employees who were only familiar with Perso-Arabic script and their side was backed by some British officials including Captain Sir Richard Francis Burton. The matter got referred to the Court of Directors of the British East India Company. Based on the fact that the Muslim names were not properly written with Devnagri, the use of the Perso-Arabic script was recommended by the court. Following this decision, a team of scholars including Sir Burton, Munshi Thanwardas and Mirza Sadiq Ali Beg worked on the standardization of Perso-Arabic based script for Sindhi language. The standardization of this script was completed in and it consisted of letters derived from Perso- Arabic scripts with addition of dots and lines to fully represent all of the Sindhi sounds. This script is currently being used in Sindh and abroad and is based on Arabic Nashk style of writing . Figs. - show the list of these characters. Short vowels and some additional vocalic and consonantal features are also represented through diacritical marks in Sindhi as listed in Fig. . The diacritics are optionally used in writing, however, for our system development they have not been used for the sake of simplicity and readability. Thus, the design of GUISL involves numerous key factors that need to be addressed for designing the interface. Sindhi is written and read from right-to-left direction, therefore the system should have the ability to display text from right-to-left and everything of GUI should be presented from right-to- left for accommodating the Sindhi users. According to the Unicode standard the characters should be inserted and stored in a simple logical sequence . If we wish to write the word , the keys would be pressed in succession corresponding to the characters making up the word . 
The text file would store the Unicode codes of these characters in order as shown below. To display the text in an appropriate order of Sindhi the system needs to correspond to each of the successive FIG. . STANDARD SINDHI CHARACTERS COMMONLY USED IN SINDHI SCRIPT FIG. . ADDITION SINDHI CHARACTER USED IN SINDHI LANGUAGE FIG. . SINDHI DIACRITICS DESIGN & DEVELOPMENT OF THE GRAPHICAL USER INTERFACE FOR SINDHI LANGUAGE MEHRAN UNIVERSITY RESEARCH JOURNAL OF ENGINEERING & TECHNOLOGY, VOLUME , NO. , OCTOBER, code to be displayed in the right-to-left order however it would be displayed into left-to-right as shown below. This is what happens with Roman Script making it straight forward and simple to render . Thus it needs reordering or grouping with one-to-many relationship between characters. As Sindhi language is a cursive like Arabic in its written format the above text is unacceptable as neither the ordering is correct nor the shapes of the characters as per cursive nature of the script. The system should be able to display Sindhi character properly with its cursive forms while ligature used to form Sindhi words. The development of the GUISL should provide easy access for ordinary user and without any additional requirements such as Sindhi Fonts, Keyboard Layout setup, Regional Language Setting etc. These design issues pose impediments in the development of the Sindhi software system. . Design & development of the GUISL Three major issues are analyzed to design and standardize the Sindhi user interface: Unicode of characters, keyboard layout, and Sindhi cursive form. Unfortunately, there is no ordering of Sindhi Unicode characters in the Unicode plane and all the characters are scattered and mixed with Arabic and Farsi character set as shown in Fig. . The absence of a sequence and segregated set of character coding leads to develop ad- hoc based systems for process of Sindhi computing. 
The first step involves changing the orientation of all the GUI elements in system from right-to-left allowing the initial, middle and last representations of the characters to be rendered appropriately with proper cursive for the ligature to form the word . . Unicode Based Sindhi Characters Coding A static class was created in Java language, declaring all the Unicode based Sindhi characters to be used in GUISL as shown in pseudo-code given in the Algorithm- . The list of characters along with their variable names mapped with Unicode of each character is shown in Table . . Sindhi Cursive Form Sindhi script has a cursive form similar to that in Arabic, that is, the letters in the Sindhi script join together into units to form words. Sindhi script also has context sensitive glyph shaping; depending on whether the character joins a word in the initial, medial or final position or is isolated taking a different shape as shown in Table . Nonetheless, cursiveness, ligation and context sensitivity are rendering related issues and the output shapes of characters may FIG. . ARABIC/SINDHI UNICODE PLANE: THE ANNOTATION OF CIRCLES INDICATES THE RELATIVE POSITION OF SINDHI ALPHABET CHARACTERS IN THE PLANE . DESIGN & DEVELOPMENT OF THE GRAPHICAL USER INTERFACE FOR SINDHI LANGUAGE MEHRAN UNIVERSITY RESEARCH JOURNAL OF ENGINEERING & TECHNOLOGY, VOLUME , NO. , OCTOBER, vary with context, their internal encoding remains unchanged. For example, the letter È{b:Beh} may take multiple shapes but its internal encoding is always U+ as shown in Table . Therefore, these properties have no implication on collation . TABLE . 
LIST OF SINDHI CHARACTER, ITS CORRESPONDING UNICODE AND ITS VARIABLE NAME USED IN THE SYSTEM Variable Unicode Sindhi Veriable Unicode Sindhi Name Letter Name Letter sheen alifMadA swad alif dad beh toye beeh 067B zoye peh 067E aieen beheh ghain 063A the 062A feh theh 067F peheh 06A6 mytheey 067D qaf tteheh 067A Kaf 06AA ttay 062B keheh 06A9 jeem 062C gaf 06AF dyeh geuh 06B3 nyeh ngoeh 06B1 cheh lam cheheh meem hah 062D noon khay 062E rnoon 06BB dal 062F waw dahal 068C heh 06BE dhal 068F hamza ddal 068A Yeh 064A ddahal 068D yehSmall 06C1 zal yehHamza reh Min 06FE rdeh sindhi Ampersand 06FD zeh sceen Algorithm : Pseudo code of the static class used for the declaration of Sindhi characters and initialized with their corresponding Unicode characters. : <package> <guisl.com.snd> : <public><static><class><class.name> : <public><static><data type: char><variable.name:alif> = <value: ‘\\u0627’> : <public><static><data type: char><variable.name:beh> = <value: ‘\\u0628’> : <public><static><data type: char><variable.name:beeh> = <value: ‘\\u067B’> : <public><static><data type: char><variable.name:peh> = <value: ‘\\u067E’> : <public><static><data type: char><variable.name:beheh> = <value: ‘\\u0680’> : <public><static><data type: char><variable.name:the> = <value: ‘\\u062A’> : <public><static><data type: char><variable.name:theh> = <value: ‘\\u067F’> : <public><static><data type: char><variable.name:ttay> = <value: ‘\\u062B’> . . . . . . . . . . . . n: <end of class> TABLE . MULTIPLE GLYPH FORM OF SINDHI CHARACTERS Sindhi Isolated Last Medial Initial Unicode Name Glyph Glyph Glyph Glyph Form Form Form Form Form U+ BEG U+ NOON U+064A YEH U+062D HAH U+ SEEN U+ SAD U+ TAH DESIGN & DEVELOPMENT OF THE GRAPHICAL USER INTERFACE FOR SINDHI LANGUAGE MEHRAN UNIVERSITY RESEARCH JOURNAL OF ENGINEERING & TECHNOLOGY, VOLUME , NO. , OCTOBER, The Pseudo code shown in Algorithms - illustrates the main steps performed on each GUI object for displaying Sindhi text from example and respectively. 
Each Sindhi character is mapped to a corresponding static variable which is then concatenated with each other to form a Sindhi word as illustrated in examples and below: Sindhi cursive form: {Sindhi} Indiviual Characters {s: Seen} {n : Noon} {dh: Dhal} {y: Yeh} Example : Sindhi Word to be formed from the variable joining The Algorithm- illustrates the Pseudo.code to create a Sindhi String with multiple words being concatenated to form a statement. First each word is concatenated separately and assigned to variable and then all the variables are concatenated to create a complete statement. . GUI in Sindhi Language Using the variable joining method described in previous section Sindhi GUI architecture was designed as shown in Fig. . The system architecture contains the basic GUI elements grouped under the relevant menu. The main menu groups are written in both English and Sindhi script for ease of understanding. Fig. demonstrate the various user interface menus with Sindhi script developed for the GUISL. The Menus have been designed with Tab-Based layout instead of conventional Drop-Down menus. The File Menu shown in Fig. contains the basic tools regarding the input output operations such as , , , etc. The Fig. shows the menu which contains the tools regarding the basic editing options such as , , , etc. Similarly the menu shown in Fig. contains the options for loading various dictionary types such as Sindhi to English Dictionary, English to Sindhi Dictionary, Computer, Medical and Business Sindhi Dictionaries. Fig. contains the on-screen Sindhi keyboard for visual typing using mouse. Two keyboard types have been used in the GUISL system, one with sequential Sindhi keys as shown in Fig. and the other is the on-screen Sindhi keyboard layout as shown in Fig. . . User Convenience In the GUISL the tooltips have also been written in Sindhi- English format so that the user may get the information on a particular tool both in Sindhi and in English. Fig. Algorithm . 
Pseudo code of the function used for the mapping of the Sindhi character from example with its corresponding Unicode , concatenated with each other to form a Sindhi ligature. : <function> <function.name> : <variable>= String. concatenate : <GUI Object> {Change Orientation to RIGHT.TO.LEFT} : <GUI Object>.setText : <end of function> Sindhi cursive form: {Unicode Based Sindhi Word Processor} Example : Sindhi Word to be formed from the variable joining Algorithm . Pseudo code of the function used for the mapping of the Sindhi string from example with its corresponding Unicode , concatenated with each other to form a Sindhi statement. : <function> <function.name> : <variable >= String. concatenate : <variable >= String. concatenate : <variable >= String. concatenate : <variable4>= String. concatenate : : <variable >= String. concatenate : <variable >= String. concatenate : <GUI Object> {Change Orientation to RIGHT.TO.LEFT} : : <GUI Object>.setText : <end of function> DESIGN & DEVELOPMENT OF THE GRAPHICAL USER INTERFACE FOR SINDHI LANGUAGE MEHRAN UNIVERSITY RESEARCH JOURNAL OF ENGINEERING & TECHNOLOGY, VOLUME , NO. , OCTOBER, FIG. . THE MAIN ARCHITECTURE OF THE GUI IN SINDHI FIG. . FILE MENU FIG. . EDIT MEMU FIG. . DICTIONARY MENU DESIGN & DEVELOPMENT OF THE GRAPHICAL USER INTERFACE FOR SINDHI LANGUAGE MEHRAN UNIVERSITY RESEARCH JOURNAL OF ENGINEERING & TECHNOLOGY, VOLUME , NO. , OCTOBER, shows the various tooltips in Sindhi-English format. The bi-directional text right-to-left for Sindhi and left-to- right for English is used. The same method of concatenating Unicode variables from Table to English words have been used to display the text. Fig. shows the tooltip for the ‘Save File’. Similarly Fig. displays the tooltip regarding the different dictionary types. The Fig. displays the tooltip for the ‘Undo’, ‘Redo’ and ‘Exit’ options. . 
Message and Tool Windows All the pop-up message and tool windows in the system contain the text in both Sindhi and English language as shown in Fig. . Fig. shows the message window that appears before closing the file asking to save the changes both in Sindhi and English. Fig. displays the ‘Find and Replace’ tool Window have Sindhi-English text in the title bar of the window and in the tabs of Find and Replace options. Fig. shown below is for the task pane that displays the various tools and tasks in the system with Sindhi text on its labels and buttons. . Allana, G.A., “An Introduction to Sindhi Literature”, Sindhi Adabi Board, Jamshoro, Pakistan, . Ethnologue.com, “Sindhi: A Language of Pakistan”, Retrieved from / show_language.asp?code=snd on 26th April, . Bulchand, D., “A Manual of Sindhi”, , Revised by Joyo, M.I., Sindhi Language Authority, Hyderabad, Pakistan, . FIG. . TOOL TIP ON SAVE FILE OPTION FIG. . TOOL TIPS ON DICTIONARIES FIG. . TOOL TIP ON UNDO AND REDO AND EXIT OPTIONS. FIG. . SAVE FILE MESSAGE DIALOG FIG. . FIND DIALOG DESIGN & DEVELOPMENT OF THE GRAPHICAL USER INTERFACE FOR SINDHI LANGUAGE MEHRAN UNIVERSITY RESEARCH JOURNAL OF ENGINEERING & TECHNOLOGY, VOLUME , NO. , OCTOBER, Bhurgari, A.M., “Enabling Pakistani Languages through Unicode”, Retrieved from . microsoft. com/download////142aef9 f-1a74-4a24-b1f4- d48d41a6d/PakLang.pdf on 20th August, . The Unicode Consortium, “The Unicode Standard .”, Addison Wesley, New York, USA, . Parvez, A., “The Adaptation of the Perso-Arabic Script for Urdu, Panjabi, and Sindhi”, LCCN Permalink, New Delhi, India, . Hussain, S., and Durrani, N., “A Study on Collation of Languages from Developing Asia”, Center for Research in Urdu Language Processing, National University of Computer and Emerging Sciences, , Pakistan. Omniglot.com, “Sindhi”, Retrieved from http:// www.omniglot.com/writing/sindhi.htm on 26th April, . Al-Salman, A.S., “An Arabic Programming Environment”, ACM Symposium on Applied Computing, Volume , pp. 
-, October, . Sindhilanguage.com, “Sindhi Script”, URL: http:// www.sindhilanguage.com/script.html PAN Localization: A Regional Initiative to Develop Local Language Computing Capacity in Asia: http:// www.panl10n.net/english/outputs/Collation%20Book/ Collation%20Book/Fin al%20Versions/pdfs/Sindhi.pdf FIG. . TASK PANE FOR DICTIONARY",
         "Business, Medicine, Business, Computer Science, Medicine, Computer Science",
         "1e4fafaaa7109aafc3a33057dbec444826ead589",
         "7.82122905027933",
         "No",
         "graphical user interface designed developed sindhi language?",
         "NLP",
         "Design & Development of the Graphical User Interface for Sindhi Language",
         "arXiv.org",
         "2014",
         "3",
         "Workshop/Other",
         "179",
         "3977.0",
         "cs.hc cs.cl"
        ]
       ],
       "shape": {
        "columns": 19,
        "rows": 5
       }
      },
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>abstract</th>\n",
       "      <th>author_ids</th>\n",
       "      <th>citation_count</th>\n",
       "      <th>conclusion</th>\n",
       "      <th>introduction</th>\n",
       "      <th>authors_final_FOS</th>\n",
       "      <th>paper_id</th>\n",
       "      <th>percent_societal</th>\n",
       "      <th>RQ_societal</th>\n",
       "      <th>RQ</th>\n",
       "      <th>subdomain</th>\n",
       "      <th>title</th>\n",
       "      <th>venue</th>\n",
       "      <th>year</th>\n",
       "      <th>team_size</th>\n",
       "      <th>event_type</th>\n",
       "      <th>total_sentences</th>\n",
       "      <th>article_length</th>\n",
       "      <th>categories</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NaN</td>\n",
       "      <td>145744305, 144494734, 2226362, 1397958221</td>\n",
       "      <td>77</td>\n",
       "      <td>Discussion and Conclusion We provided an overv...</td>\n",
       "      <td>Introduction In modern electronic medical reco...</td>\n",
       "      <td>Geography, Medicine, Computer Science, Enginee...</td>\n",
       "      <td>a0c0918623392a317b944c266deacf16e186660f</td>\n",
       "      <td>9.055118</td>\n",
       "      <td>No</td>\n",
       "      <td>unified system architecture natural language p...</td>\n",
       "      <td>NLP</td>\n",
       "      <td>Natural Language Processing in Biomedicine: A ...</td>\n",
       "      <td>Methods in molecular biology</td>\n",
       "      <td>2014</td>\n",
       "      <td>4</td>\n",
       "      <td>Workshop/Other</td>\n",
       "      <td>508</td>\n",
       "      <td>7737.0</td>\n",
       "      <td>cs.cl</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>In this paper we examine the usefulness of two...</td>\n",
       "      <td>None, 2883464, 2159697</td>\n",
       "      <td>2</td>\n",
       "      <td>Conclusion and Future Work To the best of our ...</td>\n",
       "      <td>Introduction Ever since the beginning of evolu...</td>\n",
       "      <td>Geography, Mathematics, Mathematics, Computer ...</td>\n",
       "      <td>c2ef17bf4344a62e6a83efd98dc2c0b160efd3e5</td>\n",
       "      <td>8.126411</td>\n",
       "      <td>No</td>\n",
       "      <td>quantitative methods used accurately predict f...</td>\n",
       "      <td>NLP</td>\n",
       "      <td>Quantitative methods for Phylogenetic Inferenc...</td>\n",
       "      <td>arXiv.org</td>\n",
       "      <td>2014</td>\n",
       "      <td>3</td>\n",
       "      <td>Workshop/Other</td>\n",
       "      <td>443</td>\n",
       "      <td>6724.0</td>\n",
       "      <td>cs.cl cs.ai</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>In this article, we investigate the properties...</td>\n",
       "      <td>1759583, 143739029</td>\n",
       "      <td>0</td>\n",
       "      <td>Conclusion In this paper, we tested if the lan...</td>\n",
       "      <td>Introduction and related work . Power laws Man...</td>\n",
       "      <td>Geography, Computer Science, Political Science...</td>\n",
       "      <td>b1cee2480e3ce3fefe40fd29640fa14d554259bf</td>\n",
       "      <td>1.683502</td>\n",
       "      <td>No</td>\n",
       "      <td>phoneme n-grams across different language fami...</td>\n",
       "      <td>NLP</td>\n",
       "      <td>Properties of phoneme N -grams across the worl...</td>\n",
       "      <td>arXiv.org</td>\n",
       "      <td>2014</td>\n",
       "      <td>2</td>\n",
       "      <td>Workshop/Other</td>\n",
       "      <td>297</td>\n",
       "      <td>4097.0</td>\n",
       "      <td>cs.cl stat.co</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Spoken Language Systems at Saarland University...</td>\n",
       "      <td>143862204, 33477621, 144036063, 40157771, 2561225</td>\n",
       "      <td>59</td>\n",
       "      <td>Conclusion The LSV English slot ﬁlling system ...</td>\n",
       "      <td>Introduction The English slot ﬁlling task of T...</td>\n",
       "      <td>Mathematics, Computer Science, Computer Scienc...</td>\n",
       "      <td>4062a185b0aeb76626b13a59814128dd2a26ddc3</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>No</td>\n",
       "      <td>shallow distant supervision methods effective ...</td>\n",
       "      <td>NLP</td>\n",
       "      <td>Effective Slot Filling Based on Shallow Distan...</td>\n",
       "      <td>Text Analysis Conference</td>\n",
       "      <td>2014</td>\n",
       "      <td>5</td>\n",
       "      <td>Workshop/Other</td>\n",
       "      <td>290</td>\n",
       "      <td>5128.0</td>\n",
       "      <td>cs.cl</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>This paper describes the design and implementa...</td>\n",
       "      <td>1954336, 1848245, 2303781</td>\n",
       "      <td>5</td>\n",
       "      <td>CONCLUSIONS &amp; FUTURE DIRECTIONS This paper has...</td>\n",
       "      <td>INTRODUCTION T oday computers play very import...</td>\n",
       "      <td>Business, Medicine, Business, Computer Science...</td>\n",
       "      <td>1e4fafaaa7109aafc3a33057dbec444826ead589</td>\n",
       "      <td>7.821229</td>\n",
       "      <td>No</td>\n",
       "      <td>graphical user interface designed developed si...</td>\n",
       "      <td>NLP</td>\n",
       "      <td>Design &amp; Development of the Graphical User Int...</td>\n",
       "      <td>arXiv.org</td>\n",
       "      <td>2014</td>\n",
       "      <td>3</td>\n",
       "      <td>Workshop/Other</td>\n",
       "      <td>179</td>\n",
       "      <td>3977.0</td>\n",
       "      <td>cs.hc cs.cl</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            abstract  \\\n",
       "0                                                NaN   \n",
       "1  In this paper we examine the usefulness of two...   \n",
       "2  In this article, we investigate the properties...   \n",
       "3  Spoken Language Systems at Saarland University...   \n",
       "4  This paper describes the design and implementa...   \n",
       "\n",
       "                                          author_ids  citation_count  \\\n",
       "0          145744305, 144494734, 2226362, 1397958221              77   \n",
       "1                             None, 2883464, 2159697               2   \n",
       "2                                 1759583, 143739029               0   \n",
       "3  143862204, 33477621, 144036063, 40157771, 2561225              59   \n",
       "4                          1954336, 1848245, 2303781               5   \n",
       "\n",
       "                                          conclusion  \\\n",
       "0  Discussion and Conclusion We provided an overv...   \n",
       "1  Conclusion and Future Work To the best of our ...   \n",
       "2  Conclusion In this paper, we tested if the lan...   \n",
       "3  Conclusion The LSV English slot ﬁlling system ...   \n",
       "4  CONCLUSIONS & FUTURE DIRECTIONS This paper has...   \n",
       "\n",
       "                                        introduction  \\\n",
       "0  Introduction In modern electronic medical reco...   \n",
       "1  Introduction Ever since the beginning of evolu...   \n",
       "2  Introduction and related work . Power laws Man...   \n",
       "3  Introduction The English slot ﬁlling task of T...   \n",
       "4  INTRODUCTION T oday computers play very import...   \n",
       "\n",
       "                                   authors_final_FOS  \\\n",
       "0  Geography, Medicine, Computer Science, Enginee...   \n",
       "1  Geography, Mathematics, Mathematics, Computer ...   \n",
       "2  Geography, Computer Science, Political Science...   \n",
       "3  Mathematics, Computer Science, Computer Scienc...   \n",
       "4  Business, Medicine, Business, Computer Science...   \n",
       "\n",
       "                                   paper_id  percent_societal RQ_societal  \\\n",
       "0  a0c0918623392a317b944c266deacf16e186660f          9.055118          No   \n",
       "1  c2ef17bf4344a62e6a83efd98dc2c0b160efd3e5          8.126411          No   \n",
       "2  b1cee2480e3ce3fefe40fd29640fa14d554259bf          1.683502          No   \n",
       "3  4062a185b0aeb76626b13a59814128dd2a26ddc3          0.000000          No   \n",
       "4  1e4fafaaa7109aafc3a33057dbec444826ead589          7.821229          No   \n",
       "\n",
       "                                                  RQ subdomain  \\\n",
       "0  unified system architecture natural language p...       NLP   \n",
       "1  quantitative methods used accurately predict f...       NLP   \n",
       "2  phoneme n-grams across different language fami...       NLP   \n",
       "3  shallow distant supervision methods effective ...       NLP   \n",
       "4  graphical user interface designed developed si...       NLP   \n",
       "\n",
       "                                               title  \\\n",
       "0  Natural Language Processing in Biomedicine: A ...   \n",
       "1  Quantitative methods for Phylogenetic Inferenc...   \n",
       "2  Properties of phoneme N -grams across the worl...   \n",
       "3  Effective Slot Filling Based on Shallow Distan...   \n",
       "4  Design & Development of the Graphical User Int...   \n",
       "\n",
       "                          venue  year  team_size      event_type  \\\n",
       "0  Methods in molecular biology  2014          4  Workshop/Other   \n",
       "1                     arXiv.org  2014          3  Workshop/Other   \n",
       "2                     arXiv.org  2014          2  Workshop/Other   \n",
       "3      Text Analysis Conference  2014          5  Workshop/Other   \n",
       "4                     arXiv.org  2014          3  Workshop/Other   \n",
       "\n",
       "   total_sentences  article_length     categories  \n",
       "0              508          7737.0          cs.cl  \n",
       "1              443          6724.0    cs.cl cs.ai  \n",
       "2              297          4097.0  cs.cl stat.co  \n",
       "3              290          5128.0          cs.cl  \n",
       "4              179          3977.0    cs.hc cs.cl  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "f4c2d988",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv(os.path.join(DATA_DIR, \"June2025_final_data.csv\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3925c95c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_and_preprocess_data(file_path):\n",
    "    df = pd.read_csv(file_path)\n",
    "    print(f\"Initial shape: {df.shape}\")\n",
    "    \n",
    "    # Drop duplicates\n",
    "    df = df.drop_duplicates(subset='paper_id')\n",
    "    print(f\"After removing duplicates: {df.shape}\")\n",
    "    \n",
    "    # Filter out very short papers\n",
    "    df = df[df['total_sentences'] > 4].copy()\n",
    "    print(f\"After filtering short papers: {df.shape}\")\n",
    "    \n",
    "    # Create clean outcome variable\n",
    "    df['percent_societal'] = df['percent_societal_new']\n",
    "    \n",
    "    return df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "societal_influence_env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.21"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
